In [22]:
import pandas as pd
import numpy as np
from haversine import haversine

https://www.kaggle.com/datasets/kartik2112/fraud-detection?resource=download

In [23]:
df = pd.read_csv("fraudTrain.csv")
df.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [24]:
df1 = df.copy()
df1 = df1.drop(["Unnamed: 0", "first", "last", "street", "trans_num", 
          "cc_num", "zip", "job"], axis=1)

In [25]:
df1["trans_date_trans_time"] = pd.to_datetime(df1["trans_date_trans_time"])

df1["trans_hour"] = df1["trans_date_trans_time"].dt.hour
df1["trans_day"] = df1["trans_date_trans_time"].dt.day
df1["trans_month"] = df1["trans_date_trans_time"].dt.month
df1["is_weekend"] = df1["trans_date_trans_time"].dt.weekday >= 5

In [26]:
df1["dob"] = pd.to_datetime(df1["dob"])
df1["age"] = (df1["trans_date_trans_time"] - df1["dob"]).dt.days // 365

In [27]:
df1["log_amt"] = np.log1p(df1["amt"])

In [28]:
def calc_distance(row):
    return haversine(
        (row["lat"], row["long"]),
        (row["merch_lat"], row["merch_long"])
    )

df1["distance_km"] = df1.apply(calc_distance, axis=1)

In [29]:
df1["city_pop_log"] = np.log1p(df1["city_pop"])

In [30]:
df1 = pd.get_dummies(df1, columns=["category", "gender"], drop_first=True)

In [31]:
merchant_fraud_rate = df1.groupby("merchant")["is_fraud"].mean()
df1["merchant_risk"] = df1["merchant"].map(merchant_fraud_rate)

In [32]:
df1 = df1.drop(columns=[
    "trans_date_trans_time", "dob",
    "lat", "long", "merch_lat", "merch_long",
    "merchant","unix_time"])

In [39]:
for i in df1.columns:
    if df1[i].dtypes == "bool":
        df1[i] = df1[i].astype(int)

In [40]:
df1.head()

Unnamed: 0,amt,city,state,city_pop,is_fraud,trans_hour,trans_day,trans_month,is_weekend,age,...,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M,merchant_risk
0,4.97,Moravian Falls,NC,3495,0,0,1,1,0,30,...,0,0,1,0,0,0,0,0,0,0.014207
1,107.23,Orient,WA,149,0,0,1,1,0,40,...,0,0,0,0,0,0,0,0,0,0.010787
2,220.11,Malad City,ID,4154,0,0,1,1,0,56,...,0,0,0,0,0,0,0,0,1,0.002111
3,45.0,Boulder,MT,1939,0,0,1,1,0,52,...,0,0,0,0,0,0,0,0,1,0.003444
4,41.96,Doe Hill,VA,99,0,0,1,1,0,32,...,0,0,0,1,0,0,0,0,1,0.003769


In [42]:
df1 = df1.drop(["city", "state", "city_pop"], axis=1)

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [46]:
df1.columns

Index(['amt', 'is_fraud', 'trans_hour', 'trans_day', 'trans_month',
       'is_weekend', 'age', 'log_amt', 'distance_km', 'city_pop_log',
       'category_food_dining', 'category_gas_transport',
       'category_grocery_net', 'category_grocery_pos',
       'category_health_fitness', 'category_home', 'category_kids_pets',
       'category_misc_net', 'category_misc_pos', 'category_personal_care',
       'category_shopping_net', 'category_shopping_pos', 'category_travel',
       'gender_M', 'merchant_risk'],
      dtype='object')

In [48]:
FEATURES = ['amt', 'trans_hour', 'trans_day', 'trans_month',
       'is_weekend', 'age', 'log_amt', 'distance_km', 'city_pop_log',
       'category_food_dining', 'category_gas_transport',
       'category_grocery_net', 'category_grocery_pos',
       'category_health_fitness', 'category_home', 'category_kids_pets',
       'category_misc_net', 'category_misc_pos', 'category_personal_care',
       'category_shopping_net', 'category_shopping_pos', 'category_travel',
       'gender_M', 'merchant_risk']
target = "is_fraud"
X = df1[FEATURES].values
y = df1[target].values

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [49]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [50]:
import tensorflow as tf
from tensorflow.keras import layers, models


In [51]:
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]
)


In [52]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weight_dict = dict(enumerate(class_weights))


In [53]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=1024,
    class_weight=class_weight_dict,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_auc',
            patience=3,
            restore_best_weights=True
        )
    ]
)


Epoch 1/20
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.8622 - auc: 0.9469 - loss: 0.3003 - precision: 0.0359 - recall: 0.8824 - val_accuracy: 0.9573 - val_auc: 0.9892 - val_loss: 0.1463 - val_precision: 0.1140 - val_recall: 0.9420
Epoch 2/20
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9440 - auc: 0.9860 - loss: 0.1504 - precision: 0.0889 - recall: 0.9382 - val_accuracy: 0.9619 - val_auc: 0.9949 - val_loss: 0.1016 - val_precision: 0.1286 - val_recall: 0.9667
Epoch 3/20
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9535 - auc: 0.9914 - loss: 0.1168 - precision: 0.1067 - recall: 0.9545 - val_accuracy: 0.9673 - val_auc: 0.9960 - val_loss: 0.0841 - val_precision: 0.1475 - val_recall: 0.9727
Epoch 4/20
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9590 - auc: 0.9935 - loss: 0.1007 - precision: 0.1200 - recall: 

In [54]:
class ResNeXtDNNBlock(layers.Layer):
    def __init__(self, hidden_dim=64, cardinality=8):
        super().__init__()

        self.branches = [
            models.Sequential([
                layers.Dense(hidden_dim, activation='relu'),
                layers.Dense(hidden_dim)
            ])
            for _ in range(cardinality)
        ]

        self.activation = layers.ReLU()

    def call(self, x):
        branch_outputs = [branch(x) for branch in self.branches]
        aggregated = tf.add_n(branch_outputs)
        return self.activation(aggregated + x)

In [55]:
inputs = layers.Input(shape=(X_train.shape[1],))

x = layers.Dense(64)(inputs)   # projection layer
x = ResNeXtDNNBlock(64, cardinality=8)(x)
x = ResNeXtDNNBlock(64, cardinality=8)(x)

x = layers.Dropout(0.3)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model = models.Model(inputs, outputs)





In [56]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]
)

In [57]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=1024,
    class_weight=class_weight_dict,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_auc',
            patience=3,
            restore_best_weights=True
        )
    ]
)

Epoch 1/20
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - auc: 0.9798 - loss: 0.1819 - precision: 0.0688 - recall: 0.9206 - val_auc: 0.9926 - val_loss: 0.0781 - val_precision: 0.1856 - val_recall: 0.9127
Epoch 2/20
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - auc: 0.9925 - loss: 0.1080 - precision: 0.1075 - recall: 0.9589 - val_auc: 0.9950 - val_loss: 0.0795 - val_precision: 0.1656 - val_recall: 0.9514
Epoch 3/20
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - auc: 0.9939 - loss: 0.0949 - precision: 0.1201 - recall: 0.9670 - val_auc: 0.9956 - val_loss: 0.0819 - val_precision: 0.1654 - val_recall: 0.9634
Epoch 4/20
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - auc: 0.9952 - loss: 0.0851 - precision: 0.1361 - recall: 0.9670 - val_auc: 0.9950 - val_loss: 0.0923 - val_precision: 0.1230 - val_recall: 0.9780
Epoch 5/20
[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━