In [1]:
import os

import numpy as np
import pandas as pd

import datetime
import time

import random

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'darkgrid' style to all subsequent plots, overriding the axes
# background colour with the grey level '0.9'.
sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

In [2]:
def generate_customer_profiles_table(n_customers, random_state=0):
    """Build a table of randomly generated customer profiles.

    Parameters
    ----------
    n_customers : int
        Number of profiles to generate; ids run from 0 to n_customers - 1.
    random_state : int, default 0
        Seed applied to NumPy's global random generator before drawing.

    Returns
    -------
    pandas.DataFrame
        One row per customer with the columns CLIENTE_ID, x_pos_cliente,
        y_pos_cliente, monto_promedio, desviacion_promedio and
        promedio_tx_por_dia.
    """
    np.random.seed(random_state)

    column_names = ['CLIENTE_ID',
                    'x_pos_cliente', 'y_pos_cliente',
                    'monto_promedio', 'desviacion_promedio',
                    'promedio_tx_por_dia']

    def _draw_profile(customer_id):
        # The order of the draws determines the generated values for a given
        # seed: position x, position y, mean amount, mean transactions per day.
        pos_x = np.random.uniform(0, 100)
        pos_y = np.random.uniform(0, 100)
        mean_amount = np.random.uniform(5000, 100000)  # arbitrary (but sensible) range
        mean_tx_per_day = np.random.uniform(0, 4)      # arbitrary (but sensible) range
        # The amount standard deviation is fixed at half of the mean amount.
        return [customer_id, pos_x, pos_y, mean_amount, mean_amount / 2, mean_tx_per_day]

    profile_rows = [_draw_profile(customer_id) for customer_id in range(n_customers)]

    return pd.DataFrame(profile_rows, columns=column_names)

In [3]:
def generate_terminal_profiles_table(n_terminals, random_state=0, verbose=True):
    """Build a table of terminals placed on a small set of shared locations.

    A random number of groups (2 to 5) is drawn; each group receives a random
    number of (x, y) positions sampled uniformly in [0, 50) x [0, 50). Every
    terminal is then assigned one position of one randomly chosen group, so
    several terminals may share exactly the same coordinates.

    Parameters
    ----------
    n_terminals : int
        Number of terminals to generate; ids run from 0 to n_terminals - 1.
    random_state : int, default 0
        Seed applied to NumPy's global random generator before drawing.
    verbose : bool, default True
        When True, print the number of groups, the number of terminals, the
        upper bound on positions per group and the generated positions.

    Returns
    -------
    pandas.DataFrame
        One row per terminal with the columns TERMINAL_ID, x_pos_terminal and
        y_pos_terminal.
    """
    np.random.seed(random_state)

    # Number of location groups, drawn in {2, ..., 5}.
    n_groups = np.random.randint(2, 6)
    min_positions_per_group = 2
    # Clamp the upper bound so that it never falls below the lower bound:
    # for small n_terminals, int(n_terminals / n_groups) + 1 can be 1, and
    # np.random.randint(2, 2) raises ValueError. For larger inputs the clamp
    # is a no-op, so previously generated tables are unchanged.
    max_positions_per_group = max(min_positions_per_group,
                                  int(n_terminals / n_groups) + 1)

    # Draw the candidate positions of each group.
    group_locations = []
    for _ in range(n_groups):
        n_positions = np.random.randint(min_positions_per_group, max_positions_per_group + 1)
        group_locations.append(np.random.uniform(0, 50, size=(n_positions, 2)))

    # Assign each terminal to a random position of a random group.
    terminal_id_properties = []
    for terminal_id in range(n_terminals):
        locations = group_locations[np.random.randint(n_groups)]
        x_pos_terminal, y_pos_terminal = locations[np.random.randint(len(locations))]
        terminal_id_properties.append([terminal_id, x_pos_terminal, y_pos_terminal])

    if verbose:
        print('Número de grupos: {}'.format(n_groups))
        print('Número de terminales: {}'.format(n_terminals))
        # The value reported here is the upper bound on positions per group.
        print('Número de terminales por grupo: {}'.format(max_positions_per_group))
        print('Ubicaciones de los grupos: {}'.format(group_locations))

    terminal_profiles_table = pd.DataFrame(
        terminal_id_properties,
        columns=['TERMINAL_ID', 'x_pos_terminal', 'y_pos_terminal'])

    return terminal_profiles_table

In [4]:
def get_list_terminals_within_radius(customer_profile, x_y_terminals, r):
    """List the terminals located strictly closer than ``r`` to a customer.

    Parameters
    ----------
    customer_profile : pandas.Series
        Customer row providing 'x_pos_cliente' and 'y_pos_cliente'.
    x_y_terminals : array-like of shape (n_terminals, 2)
        (x, y) coordinates of the terminals, one row per terminal.
    r : float
        Radius; a terminal at a distance exactly equal to r is excluded.

    Returns
    -------
    list
        Row positions within ``x_y_terminals`` of the matching terminals.
        These coincide with terminal ids only when ids equal row positions.
    """
    # Customer location as a float array so the arithmetic below is vectorised.
    customer_xy = customer_profile[['x_pos_cliente', 'y_pos_cliente']].values.astype(float)

    # Euclidean distance from the customer to every terminal.
    offsets = customer_xy - x_y_terminals
    distances = np.sqrt(np.square(offsets).sum(axis=1))

    # Keep the row positions whose distance is below the radius.
    (matching_rows,) = np.where(distances < r)
    return list(matching_rows)

In [5]:
def extract_datetime_features(tx_datetime):
    """Split a timestamp into its calendar and clock components.

    Parameters
    ----------
    tx_datetime : datetime-like or str
        Any single value accepted by ``pandas.to_datetime``.

    Returns
    -------
    dict
        Keys 'year', 'month', 'day', 'hour' and 'minute', in that order.
    """
    timestamp = pd.to_datetime(tx_datetime)
    component_names = ('year', 'month', 'day', 'hour', 'minute')
    return {name: getattr(timestamp, name) for name in component_names}


In [6]:
def generate_transactions_table(customer_profile, start_date = "2023-01-01", nb_days = 10):
    """Simulate the transactions of one customer over ``nb_days`` consecutive days.

    Both Python's ``random`` module and NumPy's global generator are seeded
    with the customer id, so the output is reproducible per customer.

    Parameters
    ----------
    customer_profile : pandas.Series
        Customer row providing CLIENTE_ID, monto_promedio,
        desviacion_promedio, promedio_tx_por_dia and terminales_disponibles
        (the list of terminals the customer can use).
    start_date : str, default "2023-01-01"
        Origin date; transaction timestamps are offsets from it.
    nb_days : int, default 10
        Number of days to simulate.

    Returns
    -------
    pandas.DataFrame
        Columns TX_DATETIME, CLIENTE_ID, TERMINAL_ID, MONTO_TX,
        TIEMPO_DESDE_ULTIMA_TX_S, DIA_TX, year, month, day, hour, minute.
        TIEMPO_DESDE_ULTIMA_TX_S holds the number of seconds elapsed since
        ``start_date``. The same columns are returned when no transaction
        was generated, so results can always be concatenated and sorted.
    """
    output_columns = ['TX_DATETIME', 'CLIENTE_ID', 'TERMINAL_ID', 'MONTO_TX',
                      'TIEMPO_DESDE_ULTIMA_TX_S', 'DIA_TX',
                      'year', 'month', 'day', 'hour', 'minute']
    seconds_per_day = 86400

    random.seed(int(customer_profile.CLIENTE_ID))
    np.random.seed(int(customer_profile.CLIENTE_ID))

    terminales_disponibles = customer_profile.terminales_disponibles

    customer_transactions = []

    # A customer without any reachable terminal cannot make a transaction, so
    # the whole simulation is skipped in that case.
    if len(terminales_disponibles) > 0:

        for day in range(nb_days):

            # Random number of transactions for that day
            nb_tx = np.random.poisson(customer_profile.promedio_tx_por_dia)

            for _ in range(nb_tx):

                # Time of day in seconds: centred on noon with a standard
                # deviation of 20000 s, so most transactions occur during the day.
                time_tx = int(np.random.normal(seconds_per_day/2, 20000))

                # Draws falling outside the current day are discarded.
                if not (0 < time_tx < seconds_per_day):
                    continue

                # Amount is drawn from a normal distribution; a negative draw
                # is replaced by a uniform draw on [0, 2 * mean).
                amount = np.random.normal(customer_profile.monto_promedio,
                                          customer_profile.desviacion_promedio)
                if amount < 0:
                    amount = np.random.uniform(0, customer_profile.monto_promedio*2)
                amount = np.round(amount, decimals=2)

                terminal_id = random.choice(terminales_disponibles)

                customer_transactions.append([time_tx + day*seconds_per_day, day,
                                              customer_profile.CLIENTE_ID,
                                              terminal_id, amount])

    if not customer_transactions:
        # Same schema as the non-empty result.
        return pd.DataFrame(columns=output_columns)

    customer_transactions = pd.DataFrame(
        customer_transactions,
        columns=['TIEMPO_DESDE_ULTIMA_TX_S', 'DIA_TX', 'CLIENTE_ID', 'TERMINAL_ID', 'MONTO_TX'])

    tx_datetime = pd.to_datetime(customer_transactions['TIEMPO_DESDE_ULTIMA_TX_S'],
                                 unit='s', origin=start_date)
    customer_transactions['TX_DATETIME'] = tx_datetime

    # Vectorised extraction of the date parts; cast to int64 so the dtype does
    # not depend on the pandas version.
    for part in ('year', 'month', 'day', 'hour', 'minute'):
        customer_transactions[part] = getattr(tx_datetime.dt, part).astype('int64')

    return customer_transactions[output_columns]

In [7]:
def generate_dataset(n_customers = 10000, n_terminals = 1000000, nb_days=90, start_date="2023-01-01", r=5):
    """Generate customer profiles, terminal profiles and their transactions.

    The elapsed time of each stage is printed.

    Parameters
    ----------
    n_customers : int
        Number of customer profiles to generate (seeded with 0).
    n_terminals : int
        Number of terminal profiles to generate (seeded with 1).
    nb_days : int
        Number of days of transactions simulated per customer.
    start_date : str
        Date of the first simulated day, forwarded to the transaction generator.
    r : float
        Radius within which a terminal is available to a customer.

    Returns
    -------
    tuple
        ``(customer_profiles_table, terminal_profiles_table, transactions_df)``.
        The customer table carries the extra columns 'terminales_disponibles'
        and 'numero_terminales'. The transactions are sorted by TX_DATETIME,
        numbered by TRANSACTION_ID starting from 0, and carry the coordinates
        of their terminal.
    """
    start_time=time.time()
    customer_profiles_table = generate_customer_profiles_table(n_customers, random_state = 0)
    print("Generación tabla perfiles de cliente: {0:.2}s".format(time.time()-start_time))

    start_time=time.time()
    terminal_profiles_table = generate_terminal_profiles_table(n_terminals, random_state = 1)
    print("Generación tabla perfiles de terminal: {0:.2}s".format(time.time()-start_time))

    start_time=time.time()
    x_y_terminals = terminal_profiles_table[['x_pos_terminal','y_pos_terminal']].values.astype(float)
    # This row-wise step is independent per customer and could be parallelised
    # (for instance with pandarallel's parallel_apply).
    customer_profiles_table['terminales_disponibles'] = customer_profiles_table.apply(lambda x : get_list_terminals_within_radius(x, x_y_terminals=x_y_terminals, r=r), axis=1)
    customer_profiles_table['numero_terminales']=customer_profiles_table.terminales_disponibles.apply(len)
    print("Asociación cliente a terminales: {0:.2}s".format(time.time()-start_time))

    start_time=time.time()
    # Rows are iterated explicitly instead of using groupby('CLIENTE_ID').apply:
    # the generator reads CLIENTE_ID from the profile, and DataFrameGroupBy.apply
    # operating on the grouping column is deprecated in recent pandas releases.
    # start_date is forwarded so that the requested origin date is honoured.
    per_customer = [generate_transactions_table(profile, start_date=start_date, nb_days=nb_days)
                    for _, profile in customer_profiles_table.iterrows()]
    # Customers without transactions contribute nothing; leaving their empty
    # frames out keeps the column dtypes of the populated frames.
    populated = [table for table in per_customer if len(table) > 0]
    transactions_df = pd.concat(populated or per_customer, ignore_index=True)
    print("Generación de transacciones: {0:.2}s".format(time.time()-start_time))

    # Sort chronologically, then expose the fresh 0-based index as TRANSACTION_ID.
    transactions_df = (transactions_df
                       .sort_values('TX_DATETIME')
                       .reset_index(drop=True)
                       .reset_index()
                       .rename(columns={'index': 'TRANSACTION_ID'}))

    # Terminal coordinates are added so they can be used as training features.
    transactions_df = pd.merge(transactions_df, terminal_profiles_table[['TERMINAL_ID', 'x_pos_terminal', 'y_pos_terminal']], on='TERMINAL_ID', how='left')

    return (customer_profiles_table, terminal_profiles_table, transactions_df)

In [8]:
def add_frauds(customer_profiles_table, terminal_profiles_table, transactions_df):
    """Label transactions as fraudulent according to three simulated scenarios.

    The input frame is left untouched; a labelled copy is returned. Calling
    the function again on the same input therefore gives the same result
    instead of multiplying the scenario-3 amounts a second time.

    Scenarios, applied in order (a later one may overwrite the scenario label
    of an earlier one):

    1. Every transaction with MONTO_TX above 80000.
    2. For each day, two terminals are sampled; all their transactions during
       the following 28 days are fraudulent.
    3. For each day, three customers are sampled; one third of their
       transactions during the following 14 days have MONTO_TX multiplied by
       5 and are fraudulent.

    The day loops run over ``range(DIA_TX.max())``, so the last day index is
    never used as a starting day.

    Parameters
    ----------
    customer_profiles_table : pandas.DataFrame
        Must provide the CLIENTE_ID column.
    terminal_profiles_table : pandas.DataFrame
        Must provide the TERMINAL_ID column.
    transactions_df : pandas.DataFrame
        Must provide MONTO_TX, DIA_TX, TERMINAL_ID and CLIENTE_ID.

    Returns
    -------
    pandas.DataFrame
        Copy of ``transactions_df`` with the columns TX_FRAUD (0/1) and
        TX_FRAUD_SCENARIO (0 to 3) added, and scenario-3 amounts updated.
    """
    # Work on a copy so the caller's frame (and its amounts) is not modified.
    transactions_df = transactions_df.copy()

    # By default, all transactions are genuine
    transactions_df['TX_FRAUD']=0
    transactions_df['TX_FRAUD_SCENARIO']=0

    # Scenario 1
    high_amount = transactions_df.MONTO_TX>80000
    transactions_df.loc[high_amount, 'TX_FRAUD']=1
    transactions_df.loc[high_amount, 'TX_FRAUD_SCENARIO']=1
    nb_frauds_scenario_1=transactions_df.TX_FRAUD.sum()
    print("Number of frauds from scenario 1: "+str(nb_frauds_scenario_1))

    # An empty frame has no maximum day; no day is iterated in that case.
    nb_start_days = transactions_df.DIA_TX.max() if len(transactions_df)>0 else 0
    # Never request more samples than there are profiles to sample from.
    nb_compromised_terminals = min(2, len(terminal_profiles_table))
    nb_compromised_customers = min(3, len(customer_profiles_table))

    # Scenario 2
    for day in range(nb_start_days):

        compromised_terminals = terminal_profiles_table.TERMINAL_ID.sample(n=nb_compromised_terminals, random_state=day)

        compromised = ((transactions_df.DIA_TX>=day) &
                       (transactions_df.DIA_TX<day+28) &
                       (transactions_df.TERMINAL_ID.isin(compromised_terminals)))

        transactions_df.loc[compromised,'TX_FRAUD']=1
        transactions_df.loc[compromised,'TX_FRAUD_SCENARIO']=2

    nb_frauds_scenario_2=transactions_df.TX_FRAUD.sum()-nb_frauds_scenario_1
    print("Number of frauds from scenario 2: "+str(nb_frauds_scenario_2))

    # Scenario 3
    for day in range(nb_start_days):

        compromised_customers = customer_profiles_table.CLIENTE_ID.sample(n=nb_compromised_customers, random_state=day).values

        compromised_transactions=transactions_df[(transactions_df.DIA_TX>=day) &
                                                    (transactions_df.DIA_TX<day+14) &
                                                    (transactions_df.CLIENTE_ID.isin(compromised_customers))]

        nb_compromised_transactions=len(compromised_transactions)

        random.seed(day)
        index_frauds = random.sample(list(compromised_transactions.index.values),k=int(nb_compromised_transactions/3))

        if not index_frauds:
            continue

        transactions_df.loc[index_frauds,'MONTO_TX']=transactions_df.loc[index_frauds,'MONTO_TX']*5
        transactions_df.loc[index_frauds,'TX_FRAUD']=1
        transactions_df.loc[index_frauds,'TX_FRAUD_SCENARIO']=3

    nb_frauds_scenario_3=transactions_df.TX_FRAUD.sum()-nb_frauds_scenario_2-nb_frauds_scenario_1
    print("Number of frauds from scenario 3: "+str(nb_frauds_scenario_3))

    return transactions_df

In [9]:
def get_stats(transactions_df):
    """Compute per-day transaction and fraud counts.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Must provide the columns DIA_TX, CLIENTE_ID and TX_FRAUD.

    Returns
    -------
    tuple of pandas.Series
        ``(nb_tx_per_day, nb_fraud_per_day, nb_fraudcard_per_day)``, each
        indexed by DIA_TX: the number of transactions, the sum of TX_FRAUD,
        and the number of distinct customers with at least one fraudulent
        transaction (days without fraud are absent from the last series).
    """
    by_day = transactions_df.groupby(['DIA_TX'])

    # Number of transactions per day
    tx_counts = by_day['CLIENTE_ID'].count()
    # Number of fraudulent transactions per day
    fraud_counts = by_day['TX_FRAUD'].sum()

    # Number of distinct customers hit by fraud per day
    fraud_rows = transactions_df[transactions_df['TX_FRAUD'] > 0]
    defrauded_customers = fraud_rows.groupby(['DIA_TX'])['CLIENTE_ID'].nunique()

    return (tx_counts, fraud_counts, defrauded_customers)

In [10]:
# Customers: generate 100 profiles with a fixed seed.
n_customers = 100
customer_profiles_table = generate_customer_profiles_table(n_customers, random_state = 0)
# Terminals: generate 10000 profiles with a fixed seed.
n_terminals = 10000
terminal_profiles_table = generate_terminal_profiles_table(n_terminals, random_state = 0)

# Extract the (x, y) coordinates of all terminals as a float NumPy array,
# one row per terminal, for the distance computations of the following cells.
x_y_terminals = terminal_profiles_table[['x_pos_terminal','y_pos_terminal']].values.astype(float)


Número de grupos: 2
Número de terminales: 10000
Número de terminales por grupo: 5001
Ubicaciones de los grupos: [array([[35.75946832, 30.1381688 ],
       [27.24415915, 21.18273997],
       [32.29470565, 21.87936056],
       ...,
       [26.82132781, 32.30537669],
       [21.44491957, 21.78160648],
       [ 6.27020069,  8.5496323 ]]), array([[34.91529355,  1.75760331],
       [31.54734063, 21.72760293],
       [40.45356346, 49.39447898],
       ...,
       [47.94820518, 15.58021206],
       [ 8.97623255, 21.22169065],
       [34.79075987,  3.14150312]])]


In [11]:
# For every customer, store the list of terminals lying within a radius r=50
# of the customer's position in a new 'terminales_disponibles' column.
customer_profiles_table['terminales_disponibles']=customer_profiles_table.apply(lambda x : get_list_terminals_within_radius(x, x_y_terminals=x_y_terminals, r=50), axis=1)

In [12]:
# Example: simulate 30 days of transactions for the first customer profile only.
transaction_table_customer_0=generate_transactions_table(customer_profiles_table.iloc[0], 
                                                         start_date = "2023-01-01", 
                                                         nb_days = 30)

In [13]:
# Simulate 30 days of transactions for every customer and stack the results
# into a single table with a fresh 0-based index.
# Rows are iterated explicitly instead of using groupby('CLIENTE_ID').apply:
# the generator reads CLIENTE_ID from the profile, and DataFrameGroupBy.apply
# operating on the grouping column is deprecated in recent pandas releases.
transactions_df = pd.concat(
    [generate_transactions_table(profile, nb_days=30)
     for _, profile in customer_profiles_table.iterrows()],
    ignore_index=True,
)

In [14]:
# Regenerate the three tables in one call. This rebinds customer_profiles_table,
# terminal_profiles_table and transactions_df, replacing the ones built in the
# previous cells (200 customers, 1000 terminals, 30 days, radius 5).
(customer_profiles_table, terminal_profiles_table, transactions_df)=\
    generate_dataset(n_customers = 200, 
                     n_terminals = 1000, 
                     nb_days=30, 
                     start_date="2023-01-01", 
                     r=5)

Generación tabla perfiles de cliente: 0.0026s
Número de grupos: 3
Número de terminales: 1000
Número de terminales por grupo: 334
Ubicaciones de los grupos: [array([[3.60162247e+01, 5.71874087e-03],
       [1.51166286e+01, 7.33779454e+00],
       [4.61692974e+00, 9.31301057e+00],
       [1.72780364e+01, 1.98383737e+01],
       [2.69408367e+01, 2.09597257e+01],
       [3.42609750e+01, 1.02226125e+01],
       [4.39058718e+01, 1.36937966e+00],
       [3.35233755e+01, 2.08652401e+01],
       [2.79344914e+01, 7.01934693e+00],
       [9.90507445e+00, 4.00372284e+01],
       [4.84130788e+01, 1.56712089e+01],
       [3.46161308e+01, 4.38194576e+01],
       [4.47303332e+01, 4.25221057e+00],
       [1.95273916e+00, 8.49152098e+00],
       [4.39071252e+01, 4.91734169e+00],
       [2.10553813e+01, 4.78944765e+01],
       [2.66582642e+01, 3.45938557e+01],
       [1.57757816e+01, 3.43250464e+01],
       [4.17312836e+01, 9.14413867e-01],
       [3.75072157e+01, 4.94430544e+01],
       [3.74082827e+01,

In [15]:
%time transactions_df = add_frauds(customer_profiles_table, terminal_profiles_table, transactions_df)
transactions_df.TX_FRAUD.mean()
transactions_df.TX_FRAUD.sum()

Number of frauds from scenario 1: 819
Number of frauds from scenario 2: 107
Number of frauds from scenario 3: 174
CPU times: user 152 ms, sys: 60 µs, total: 152 ms
Wall time: 149 ms


1100

In [16]:
DIR_OUTPUT = "../data/"

# exist_ok=True creates the directory only when it is missing, without a
# separate existence check beforehand.
os.makedirs(DIR_OUTPUT, exist_ok=True)

# Date of day 0; it has to match the start date used to generate the dataset
# for the file names to agree with TX_DATETIME.
start_date = datetime.datetime.strptime("2023-01-01", "%Y-%m-%d")

# Write one pickle file per simulated day, named after its calendar date.
for day in range(transactions_df.DIA_TX.max()+1):

    transactions_day = transactions_df[transactions_df.DIA_TX==day].sort_values('TIEMPO_DESDE_ULTIMA_TX_S')

    date = start_date + datetime.timedelta(days=day)
    filename_output = date.strftime("%Y-%m-%d")+'.pkl'

    # Protocol=4 required for Google Colab
    transactions_day.to_pickle(os.path.join(DIR_OUTPUT, filename_output), protocol=4)