In [None]:
# Initialization: fetch the shared helper module and the raw simulated dataset.

# Download shared_functions.py into the working directory, then execute it so
# that the names it defines become available in this notebook.
# NOTE(review): `os` (used below) is never imported in the visible cells, so it is
# presumably brought into scope by shared_functions.py through %run -- confirm.
!curl -O https://raw.githubusercontent.com/Fraud-Detection-Handbook/fraud-detection-handbook/main/Chapter_References/shared_functions.py
%run shared_functions.py

# Clone the simulated data from GitHub only when the "simulated-data-raw"
# directory is not already present, so re-running the cell does not clone again.
if not os.path.exists("simulated-data-raw"):
    !git clone https://github.com/Fraud-Detection-Handbook/simulated-data-raw
        

In [None]:
dir_input='./simulated-data-raw/data/' 

begin_date = "2023-12-01"
end_date = "2024-02-28"

print("Load  files")
%time transactions_df = read_from_files(dir_input, being_date, end_date)
print("{0} transactions loaded, containing {1} fraudulent transactions"\
      .format(len(transactions_df),transactions_df.tx_fraud.sum()))

In [None]:
# Preview the first five loaded transactions
transactions_df.head(n=5)

In [None]:
def is_weekend(tx_datetime):
    """Return 1 if the transaction happened on a Saturday or Sunday, else 0.

    Parameters
    ----------
    tx_datetime : object exposing a ``weekday()`` method
        For example ``datetime.datetime`` or ``pandas.Timestamp``.

    Returns
    -------
    int
        1 for a weekend day, 0 for a weekday.
    """
    # weekday() numbers the days from 0 (Monday) to 6 (Sunday)
    weekday = tx_datetime.weekday()

    # Saturday (5) and Sunday (6) make up the weekend
    return int(weekday >= 5)

In [None]:
%time transactions_df['tx_during_weekend'] = transaction_df.tx_datetime\
                                            .apply(is_weekend)

In [None]:
def is_night(tx_datetime):
    """Return 1 if the transaction happened during the night, else 0.

    "Night" is defined by the comparison below as an hour of 6 or lower,
    i.e. any time from 00:00 up to and including 06:59.

    Parameters
    ----------
    tx_datetime : object exposing an ``hour`` attribute
        For example ``datetime.datetime`` or ``pandas.Timestamp``.

    Returns
    -------
    int
        1 for a night-time transaction, 0 otherwise.
    """
    # Note the inclusive bound: the whole 06:xx hour still counts as night
    night_flag = 1 if tx_datetime.hour <= 6 else 0

    return night_flag

In [None]:
%time transactions_df['tx_during_night'] = transactions_df.tx_datetime\
                                            .apply(is_night)

In [None]:
# Show the transactions whose tx_time_days value is 30 or more
transactions_df.loc[transactions_df['tx_time_days'] >= 30]

In [None]:
def get_customer_spending_behavior_features(customer_transactions, windows_size_in_days=[1,7,30]):
    """Add rolling-window spending features to a set of transactions.

    For every window size ``w`` (in days) two columns are appended:

    * ``customer_id_nb_tx_<w>day_window``: number of transactions in the
      trailing ``w``-day window ending at the current transaction
    * ``customer_id_avg_amount_<w>day_window``: sum of ``tx_amount`` over that
      window divided by the number of transactions in it

    Parameters
    ----------
    customer_transactions : pandas.DataFrame
        Transactions to process (the notebook passes one customer's rows).
        The columns ``tx_datetime``, ``tx_amount`` and ``transaction_id`` are read.
    windows_size_in_days : iterable of int
        Window lengths, in days.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``tx_datetime``, indexed by ``transaction_id``,
        carrying the additional feature columns.
    """
    # Time-based rolling windows need a sorted datetime index; the tx_datetime
    # column itself is kept alongside the index.
    by_time = (
        customer_transactions
        .sort_values('tx_datetime')
        .set_index('tx_datetime', drop=False)
    )

    for n_days in windows_size_in_days:
        amount_window = by_time['tx_amount'].rolling(str(n_days) + 'd')

        tx_count = amount_window.count()
        # The window always contains the current transaction, so the count is
        # expected to be positive and the division well defined.
        mean_amount = amount_window.sum() / tx_count

        # Plain arrays are assigned positionally, so no index alignment happens
        by_time['customer_id_nb_tx_' + str(n_days) + 'day_window'] = tx_count.to_numpy()
        by_time['customer_id_avg_amount_' + str(n_days) + 'day_window'] = mean_amount.to_numpy()

    # Hand the rows back indexed by transaction ID (the column is kept as well)
    return by_time.set_index('transaction_id', drop=False)
    

In [None]:
# Preview the spending features for a single customer (ID 0).
# The column is selected as `customer_id`, the same lowercase name used by the
# groupby over all customers later in the notebook.
spending_behavior_customer_0 = get_customer_spending_behavior_features(transactions_df[transactions_df.customer_id == 0])
spending_behavior_customer_0

In [None]:
%time transactions_df = transactions_df.groupby('customer_id').apply(lambda x: get_customer_spending_behavior_features(x, windows_size_in_days=[1,7,30]))
transactions_df = transactions_df.sort_values('tx_datetime').reset_index(drop = True)

In [None]:
# Display the transactions table as it stands after the customer spending features were added
transactions_df

In [None]:
# Counts inside the window = counts over (delay period + window) minus counts
# over the delay period alone.
# NOTE(review): in the visible cells the four right-hand names are only assigned
# inside get_count_risk_rolling_window, so executing this cell on a fresh kernel
# would raise NameError unless they are defined elsewhere -- confirm whether
# this cell is meant to be illustrative only.
nb_fraud_window = nb_fraud_delay_window - nb_fraud_delay
nb_tx_window = nb_tx_delay_window - nb_tx_delay

In [None]:
# Risk score: number of frauds in the window divided by the number of
# transactions in the window.
# NOTE(review): relies on nb_fraud_window / nb_tx_window existing at notebook
# level; see whether this cell is intended to be executed -- confirm.
risk_window = nb_fraud_window/nb_tx_window

In [None]:
def get_count_risk_rolling_window(terminal_transactions, delay_period=7, windows_size_in_days=[1,7,30], feature="terminal_id"):
    """Add delayed rolling-window transaction counts and fraud risk scores.

    For each transaction and each window size ``w`` (in days), the window of
    interest is the ``w`` days that precede the most recent ``delay_period``
    days. Its counts are obtained as the counts over the trailing
    ``delay_period + w`` days minus the counts over the trailing
    ``delay_period`` days. Two columns are appended per window size:

    * ``<feature>_nb_tx_<w>day_window``: number of transactions in the window
    * ``<feature>_risk_<w>day_window``: frauds in the window divided by the
      number of transactions in the window (0 when the window is empty)

    Parameters
    ----------
    terminal_transactions : pandas.DataFrame
        Transactions to process (the notebook passes one terminal's rows).
        The columns ``tx_datetime``, ``tx_fraud`` and ``transaction_id`` are read.
    delay_period : int
        Number of most recent days excluded from the window.
    windows_size_in_days : iterable of int
        Window lengths, in days.
    feature : str
        Prefix used for the names of the generated columns.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``tx_datetime``, indexed by ``transaction_id``,
        with the additional columns. Every remaining NA value in the frame,
        in any column, is replaced by 0.
    """
    # Time-based rolling windows need a chronologically sorted datetime index
    terminal_transactions = terminal_transactions.sort_values('tx_datetime')
    terminal_transactions.index = terminal_transactions.tx_datetime

    # The fraud label column is lowercase, like every other column this
    # notebook reads (tx_datetime, transaction_id, terminal_id, ...)
    fraud_flags = terminal_transactions['tx_fraud']

    # Counts over the delay period alone
    delay_rolling = fraud_flags.rolling(str(delay_period)+'d')
    nb_fraud_delay = delay_rolling.sum()
    nb_tx_delay = delay_rolling.count()

    for window_size in windows_size_in_days:

        # Counts over the delay period plus the window
        delay_window_rolling = fraud_flags.rolling(str(delay_period+window_size)+'d')
        nb_fraud_delay_window = delay_window_rolling.sum()
        nb_tx_delay_window = delay_window_rolling.count()

        # Counts restricted to the window itself
        nb_fraud_window = nb_fraud_delay_window-nb_fraud_delay
        nb_tx_window = nb_tx_delay_window-nb_tx_delay

        # An empty window gives 0/0 = NaN here; it is turned into 0 below
        risk_window = nb_fraud_window/nb_tx_window

        # Lists are assigned positionally, so no index alignment happens
        terminal_transactions[feature+'_nb_tx_'+str(window_size)+'day_window'] = list(nb_tx_window)
        terminal_transactions[feature+'_risk_'+str(window_size)+'day_window'] = list(risk_window)

    terminal_transactions.index = terminal_transactions.transaction_id

    # Replace NA values with 0 (all undefined risk scores where nb_tx_window is 0)
    terminal_transactions = terminal_transactions.fillna(0)

    return terminal_transactions

In [None]:
# Show only the transactions labelled as fraudulent (tx_fraud equal to 1)
transactions_df.loc[transactions_df['tx_fraud'] == 1]

In [None]:
# Get the first terminal ID that contains frauds.
# Fraudulent rows are those with tx_fraud == 1; .iloc[0] takes the first of
# them by position, so the lookup does not depend on index label 0 surviving
# the filter.
transactions_df.loc[transactions_df.tx_fraud == 1, 'terminal_id'].iloc[0]

In [None]:
# Preview the count and risk features for a single terminal (ID 3059)
get_count_risk_rolling_window(
    transactions_df.loc[transactions_df['terminal_id'] == 3059],
    delay_period=7,
    windows_size_in_days=[1,7,30],
)

In [None]:
%time transactions_df=transactions_df.groupby('terminal_id').apply(lambda x: get_count_risk_rolling_window(x, delay_period=7, windows_size_in_days=[1,7,30], feature='terminal_id'))
transactions_df=transactions_df.sort_values('tx_datetime').reset_index(drop=True)

In [None]:
# Display the transactions table as it stands after the terminal count/risk features were added
transactions_df

In [None]:
# Export the transformed transactions, one pickle file per day
dir_output = "./simulated-data-transformed/"

# exist_ok=True keeps the cell safe to re-run once the directory exists
os.makedirs(dir_output, exist_ok=True)

# Calendar date that corresponds to tx_time_days == 0.
# Format codes are case-sensitive: %m is the month and %d the day of month
# (%M would be minutes and %D is not a valid day directive).
# NOTE(review): this origin differs from the begin_date ("2023-12-01") used
# when loading the raw files earlier in the notebook -- confirm which is intended.
start_date = datetime.datetime.strptime("2018-04-01", "%Y-%m-%d")

for day in range(transactions_df.tx_time_days.max()+1):

    # Transactions of the current day, ordered by their time in seconds
    transactions_day = transactions_df[transactions_df.tx_time_days==day].sort_values('tx_time_seconds')

    # File name is the ISO date of the day, e.g. 2018-04-01.pkl for day 0
    date = start_date + datetime.timedelta(days=day)
    filename_output = date.strftime("%Y-%m-%d")+'.pkl'

    # Protocol=4 required for Google Colab
    transactions_day.to_pickle(dir_output+filename_output, protocol=4)