In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam as Adam
from tensorflow.keras.regularizers import L2
from sklearn.metrics import f1_score
from scipy.stats import norm

# TRAINING SET

In [2]:
# Retrieve the training data
training_set = pd.read_csv('./train_ScotiaDSD.csv')
training_set.head()

Unnamed: 0,EVENT_MONTH,EVENT_DAY_OF_WEEK,EVENT_TIME,TRANSACTION_ID,FRAUD_FLAG,AVAIL_CRDT,AMOUNT,CREDIT_LIMIT,CARD_NOT_PRESENT,USER_AGENT,...,COUNT_LOW_AMT_PAST_30DAY,COUNT_MANUAL_ENTRY_PAST_30DAY,COUNT_PHONE_ORDER_PAST_30DAY,COUNT_PURCHASE_EXCLUDING_GAS_PAST_30DAY,COUNT_PLANNED_PAST_30DAY,COUNT_SWIPE_PAST_30DAY,COUNT_TRAVEL_AND_ENTERTAINMENT_PAST_30DAY,COUNT_WEEKEND_PAST_30DAY,PREV_M_INFLATION,PREV_M_UNEMP_RATE
0,2,4,14:16,a316accb,1.0,537.1,11.7,29200.0,1,,...,16,39,73,60,13,22,4,14,1.1,5.7
1,2,4,12:51,8352728b,0.0,20371.88,96.35,30700.0,1,Mozilla/5.0 (iPhone; CPU OS 16_0 like Mac OS X...,...,1,50,93,93,0,0,0,0,1.1,5.7
2,2,4,15:26,d50120e1,0.0,15628.17,193.72,19500.0,0,,...,9,3,42,42,2,33,0,12,1.1,5.7
3,2,4,16:19,e96e2139,0.0,12913.98,47.15,18400.0,0,,...,4,7,75,72,12,61,0,20,1.1,5.7
4,2,4,8:15:,32502a99,0.0,26779.35,121.88,29200.0,1,Mozilla/5.0 (iPhone; CPU OS 16_0 like Mac OS X...,...,11,6,41,42,10,30,4,16,1.1,5.7


In [3]:
# dict of mean and std for various fields
def name_from_field(field):
    start = field.find('_')
    end = field[start+1:].find('PAST')
    if (end == -1):
        return field[start+1:]
    return field[start+1:start+end]

def norm_pdf(x, mean, sd):
    prob_density = (np.pi*sd) * np.exp(-0.5*((x-mean)/sd)**2)
    return prob_density

# generate density value based on guassian p.d.f.
def density(x):
    flags = x.filter(regex='^FLAG_')
    flags = flags.loc[flags == 1]
    density = 1
    for item, value in flags.items():
        key = item[5:]
        matching = x.filter(regex=key)
        try:
            mean, std = x.loc[f'STD_{key}_PAST_30DAY'], x.loc[f'MEAN_{key}_PAST_30DAY']
            density = density * norm_pdf(x['AMOUNT'], x[mean], x[std])
        except:
            pass
    return density

density(training_set.iloc[0])

1

In [4]:
# get the flags that have corresponding STD values, MEAN values, etc
stds = training_set.filter(regex='^STD_.*_30DAY$').columns
std_flags = ['FLAG_' + name_from_field(x) for x in stds]
means = training_set.filter(regex='^MEAN_.*_30DAY$').columns
mean_flags = ['FLAG_' + name_from_field(x) for x in means]
maxs = training_set.filter(regex='^MAX_.*_30DAY$').columns
max_flags = ['FLAG_' + name_from_field(x) for x in maxs]
sums = training_set.filter(regex='^SUM_.*_30DAY$').columns
sum_flags = ['FLAG_' + name_from_field(x) for x in sums]
counts = training_set.filter(regex='^COUNT_.*_30DAY$').columns
count_flags = ['FLAG_' + name_from_field(x) for x in counts]

stds = training_set.filter(regex='^STD_.*_7DAY$').columns
std_flags_7day = ['FLAG_' + name_from_field(x) for x in stds]
means = training_set.filter(regex='^MEAN_.*_7DAY$').columns
mean_flags_7day = ['FLAG_' + name_from_field(x) for x in means]
maxs = training_set.filter(regex='^MAX_.*_7DAY$').columns
max_flags_7day = ['FLAG_' + name_from_field(x) for x in maxs]
sums = training_set.filter(regex='^SUM_.*_7DAY$').columns
sum_flags_7day = ['FLAG_' + name_from_field(x) for x in sums]
counts = training_set.filter(regex='^COUNT_.*_7DAY$').columns
count_flags_7day = ['FLAG_' + name_from_field(x) for x in counts]



In [5]:
# create custom features such as density (from normal dist), 1-value/max
def generate_values(x):
    # get the positive flags for row x
    allflags = x.filter(regex='^FLAG_')
    flags = allflags.loc[allflags == 1]
    flags = flags.index
    
    #initialize custom features
    density = 1
    count = 1
    dist_from_min = 0
    density_7day = 1
    count_7day = 0
    dist_from_min_7day = 0
    
    
    # calculate features
    for flag in flags:
        key = name_from_field(flag)
        
        if (flag in std_flags and flag in mean_flags):
            mean, std = x.loc[f'STD_{key}_PAST_30DAY'], x.loc[f'MEAN_{key}_PAST_30DAY']
            if (std != 0):
                density *= norm_pdf(x['AMOUNT'], mean, std)
        if (flag in count_flags):
            count *= x.loc[f'COUNT_{key}_PAST_30DAY']
        if (flag in max_flags):
            if (x.loc[f'MAX_{key}_PAST_30DAY'] != 0):
                dist_from_min += 1-x.loc['AMOUNT']/x.loc[f'MAX_{key}_PAST_30DAY']
        
        if (flag in std_flags_7day and flag in mean_flags_7day):
            mean, std = x.loc[f'STD_{key}_PAST_7DAY'], x.loc[f'MEAN_{key}_PAST_7DAY']
            if (std != 0):
                density_7day *= norm_pdf(x['AMOUNT'], mean, std)
        if (flag in count_flags_7day):
            count_7day += x.loc[f'COUNT_{key}_PAST_7DAY']
        if (flag in max_flags_7day):
            if (x.loc[f'MAX_{key}_PAST_7DAY'] != 0):
                dist_from_min_7day += 1-x.loc['AMOUNT']/x.loc[f'MAX_{key}_PAST_7DAY']

    # return features, as well as squared versions
    return pd.concat([
        x,
        np.square(x),
        pd.Series(
            [density**2,
            dist_from_min**2,
            density_7day**2,
            count_7day,
            dist_from_min_7day**2,
            density,
            dist_from_min,
            density_7day,
            dist_from_min_7day],
    )])

In [6]:
def df_to_input(df):
    allflags = df.filter(regex='^FLAG_')
    
    # drop irrelevant fields
    drop_array = ['FRAUD_FLAG', 'EVENT_MONTH', 'EVENT_DAY_OF_WEEK', 'EVENT_TIME']
    df = df.drop(drop_array, axis=1)
    
    # remove all non numeric types
    types = df.dtypes
    types = types != object
    types = types.values
    numeric = df.loc[:, types]
        
    # generate the desired fields
    numeric = numeric.apply(lambda row: generate_values(row), axis = 1)
    
    # normalize columns
    normalized = numeric.apply(lambda col: (col - col.mean()) / col.std())
    
    # extract input and output values
    X = normalized.values
    return X

def generate_training_data(df):
    fraudulent = df.loc[df['FRAUD_FLAG'] == 1]
    df.append([fraudulent]*50) # over sample fraudulent rows
    
    X = df_to_input(df)
    Y = df['FRAUD_FLAG']
    # split 60% training, 40% cross validation set
    cv_cutoff = len(X)*6//10
    return (X[:cv_cutoff], Y[:cv_cutoff], X[cv_cutoff:], Y[cv_cutoff:])

In [7]:
X, Y, X_cv, Y_cv = generate_training_data(training_set)

(53538, 345)
[-1.29455328e+00 -1.78210010e-01  1.30651546e+00  1.66832421e+00
 -2.12031591e-01 -6.45275646e-02  4.81946798e+00 -6.80245281e-02
 -5.16076910e-01 -1.46146067e-01 -8.12912787e-01 -1.38859487e-01
 -2.75573737e-01 -2.62128932e-01 -3.37425686e-01 -3.59691331e-01
 -3.42042879e-01 -3.92432927e-01 -1.58546549e-01 -4.34316157e-02
 -6.44707652e-01  2.63531704e+00  1.65596211e+00  3.37425686e-01
 -3.51352262e-01 -7.05556122e-02 -1.65175492e+00 -1.85715825e-01
 -1.95256395e-01 -5.80254385e-01  1.12373083e-01 -2.64190502e-01
  5.40141704e+00  1.86373408e-01 -2.99728008e-01  1.09900336e+00
 -2.26573055e-01  1.09919159e+00 -5.15994893e-02 -3.84231624e-02
  2.05318584e-01  7.74169210e-02  5.23298929e-01  5.70165015e+00
  1.74696249e+00 -2.25427769e-01  1.12262651e+01 -4.89320105e-02
 -2.63970112e-01  5.06946358e+00  3.25154912e+00 -2.48881342e-01
  2.58337300e+00  3.12089621e+00  3.28863145e+00  5.86434589e+00
  1.04597722e+01 -1.11911300e-01 -2.54013381e-01  2.20694913e+00
  2.50522227

In [8]:
input_size = X.shape[1]
reg = 0.001
model = Sequential([
    Input((input_size)),
    Dense(units = 128, activation = 'relu', kernel_regularizer=L2(reg)),
    Dense(units = 64, activation = 'relu', kernel_regularizer=L2(reg)),
    Dense(units = 32, activation = 'relu', kernel_regularizer=L2(reg)),
    Dense(units = 1, activation = 'sigmoid', kernel_regularizer=L2(reg))
])
model.compile(optimizer=Adam(learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-08), loss=BinaryCrossentropy())
model.fit(X,Y,epochs=20, verbose=1)
model.summary()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               44288     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 54,657
Trainable params: 54,657
Non-trainable params: 0
___________________

In [9]:
# classify as anomaly if model outputs greater than 0.5
def predict(x):
    y = model.predict(x)
    y = y > 0.25
    return y.reshape((-1))

In [10]:
ypred = model.predict(X_cv)
print(np.mean(ypred[Y_cv==1]))
print(np.mean(ypred[Y_cv==0]))

0.27741957
0.014353339


In [491]:
print("f1 for training set:", f1_score(predict(X), Y))
print("f1 for cross validation set:", f1_score(predict(X_cv), Y_cv))

f1 for training set: 0.5288729589804859
f1 for cross validation set: 0.5250875145857643


In [11]:
# predict on test set
test_set = pd.read_csv('./test_ScotiaDSD.csv')
test_set['TRANSACTION_ID']

0        eed6f943
1        f5cd3ab1
2        9dc2fd8e
3        ce5560f4
4        ece4a7d1
           ...   
22302    1a32b092
22303    4602156e
22304    272ff938
22305    0d2006e0
22306    7744077e
Name: TRANSACTION_ID, Length: 22307, dtype: object

# TEST SET

In [12]:
test_input = df_to_input(test_set)

In [13]:
probabilities = model.predict(test_input).reshape(-1)
predictions = predict(test_input).reshape(-1).astype(int)
results = pd.DataFrame({'TRANSACTION_ID': test_set['TRANSACTION_ID'].to_numpy(),
                        'PROBABILITY': probabilities, 
                        'PREDICTION': predictions})
print(results.head())
results.to_csv('datarama_prediction.csv', index=False)

  TRANSACTION_ID  PROBABILITY  PREDICTION
0       eed6f943     0.001623           0
1       f5cd3ab1     0.001842           0
2       9dc2fd8e     0.004879           0
3       ce5560f4     0.000543           0
4       ece4a7d1     0.036676           0
