In [1]:
import pandas as pd
import numpy as np
from haversine import haversine

https://www.kaggle.com/datasets/kartik2112/fraud-detection?resource=download

In [2]:
df = pd.read_csv("fraudTrain.csv")
df.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [3]:
df1 = df.copy()
df1 = df1.drop(["Unnamed: 0", "first", "last", "street", "trans_num", 
          "cc_num", "zip", "job"], axis=1)

In [4]:
df1["trans_date_trans_time"] = pd.to_datetime(df1["trans_date_trans_time"])

df1["trans_hour"] = df1["trans_date_trans_time"].dt.hour
df1["trans_day"] = df1["trans_date_trans_time"].dt.day
df1["trans_month"] = df1["trans_date_trans_time"].dt.month
df1["is_weekend"] = df1["trans_date_trans_time"].dt.weekday >= 5

In [5]:
df1["dob"] = pd.to_datetime(df1["dob"])
df1["age"] = (df1["trans_date_trans_time"] - df1["dob"]).dt.days // 365

In [6]:
df1["log_amt"] = np.log1p(df1["amt"])

In [7]:
def calc_distance(row):
    return haversine(
        (row["lat"], row["long"]),
        (row["merch_lat"], row["merch_long"])
    )

df1["distance_km"] = df1.apply(calc_distance, axis=1)

In [8]:
df1["city_pop_log"] = np.log1p(df1["city_pop"])

In [9]:
df1 = pd.get_dummies(df1, columns=["category", "gender"], drop_first=True)

In [10]:
merchant_fraud_rate = df1.groupby("merchant")["is_fraud"].mean()
df1["merchant_risk"] = df1["merchant"].map(merchant_fraud_rate)

In [11]:
df1 = df1.drop(columns=[
    "trans_date_trans_time", "dob",
    "lat", "long", "merch_lat", "merch_long",
    "merchant","unix_time"])

In [12]:
for i in df1.columns:
    if df1[i].dtypes == "bool":
        df1[i] = df1[i].astype(int)

In [13]:
df1.head()

Unnamed: 0,amt,city,state,city_pop,is_fraud,trans_hour,trans_day,trans_month,is_weekend,age,...,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M,merchant_risk
0,4.97,Moravian Falls,NC,3495,0,0,1,1,0,30,...,0,0,1,0,0,0,0,0,0,0.014207
1,107.23,Orient,WA,149,0,0,1,1,0,40,...,0,0,0,0,0,0,0,0,0,0.010787
2,220.11,Malad City,ID,4154,0,0,1,1,0,56,...,0,0,0,0,0,0,0,0,1,0.002111
3,45.0,Boulder,MT,1939,0,0,1,1,0,52,...,0,0,0,0,0,0,0,0,1,0.003444
4,41.96,Doe Hill,VA,99,0,0,1,1,0,32,...,0,0,0,1,0,0,0,0,1,0.003769


In [14]:
df1 = df1.drop(["city", "state", "city_pop"], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

In [54]:
import numpy as np
import pandas as pd
from haversine import haversine

def preprocess_transactions(df):
    df = df.copy()  # avoid modifying original

    # Drop unused columns
    df = df.drop([
        "Unnamed: 0", "first", "last", "street",
        "trans_num", "cc_num", "zip", "job"
    ], axis=1)

    # Datetime features
    df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])
    df["trans_hour"] = df["trans_date_trans_time"].dt.hour
    df["trans_day"] = df["trans_date_trans_time"].dt.day
    df["trans_month"] = df["trans_date_trans_time"].dt.month
    df["is_weekend"] = (df["trans_date_trans_time"].dt.weekday >= 5).astype(int)

    # Age
    df["dob"] = pd.to_datetime(df["dob"])
    df["age"] = (df["trans_date_trans_time"] - df["dob"]).dt.days // 365

    # Amount transformation
    df["log_amt"] = np.log1p(df["amt"])

    # Distance calculation
    def calc_distance(row):
        return haversine(
            (row["lat"], row["long"]),
            (row["merch_lat"], row["merch_long"])
        )

    df["distance_km"] = df.apply(calc_distance, axis=1)

    # Population feature
    df["city_pop_log"] = np.log1p(df["city_pop"])

    # One-hot encoding
    df = pd.get_dummies(df, columns=["category", "gender"], drop_first=True)

    # Merchant risk
    merchant_fraud_rate = df.groupby("merchant")["is_fraud"].mean()
    df["merchant_risk"] = df["merchant"].map(merchant_fraud_rate)

    # Drop remaining unused columns
    df = df.drop(columns=[
        "trans_date_trans_time", "dob",
        "lat", "long", "merch_lat", "merch_long",
        "merchant", "unix_time"
    ])

    # Convert bool → int
    for col in df.select_dtypes(include="bool"):
        df[col] = df[col].astype(int)

    df = df.drop(["city", "state", "city_pop"], axis=1)

    return df


In [46]:
df_train = pd.read_csv("fraudTrain.csv")
df_test = pd.read_csv("fraudTest.csv")

In [55]:
df_train_processed = preprocess_transactions(df_train)
df_test_processed  = preprocess_transactions(df_test)

In [56]:
df_train_processed.dtypes

amt                        float64
is_fraud                     int64
trans_hour                   int32
trans_day                    int32
trans_month                  int32
is_weekend                   int64
age                          int64
log_amt                    float64
distance_km                float64
city_pop_log               float64
category_food_dining         int64
category_gas_transport       int64
category_grocery_net         int64
category_grocery_pos         int64
category_health_fitness      int64
category_home                int64
category_kids_pets           int64
category_misc_net            int64
category_misc_pos            int64
category_personal_care       int64
category_shopping_net        int64
category_shopping_pos        int64
category_travel              int64
gender_M                     int64
merchant_risk              float64
dtype: object

In [16]:
df1.columns

Index(['amt', 'is_fraud', 'trans_hour', 'trans_day', 'trans_month',
       'is_weekend', 'age', 'log_amt', 'distance_km', 'city_pop_log',
       'category_food_dining', 'category_gas_transport',
       'category_grocery_net', 'category_grocery_pos',
       'category_health_fitness', 'category_home', 'category_kids_pets',
       'category_misc_net', 'category_misc_pos', 'category_personal_care',
       'category_shopping_net', 'category_shopping_pos', 'category_travel',
       'gender_M', 'merchant_risk'],
      dtype='object')

In [57]:
FEATURES = ['amt', 'trans_hour', 'trans_day', 'trans_month',
       'is_weekend', 'age', 'log_amt', 'distance_km', 'city_pop_log',
       'category_food_dining', 'category_gas_transport',
       'category_grocery_net', 'category_grocery_pos',
       'category_health_fitness', 'category_home', 'category_kids_pets',
       'category_misc_net', 'category_misc_pos', 'category_personal_care',
       'category_shopping_net', 'category_shopping_pos', 'category_travel',
       'gender_M', 'merchant_risk']
X_train = df_train_processed.drop(columns=["is_fraud"]).values
y_train = df_train_processed["is_fraud"].values

X_val   = df_test_processed.drop(columns=["is_fraud"]).values
y_val   = df_test_processed["is_fraud"].values

In [58]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [19]:
import tensorflow as tf
from tensorflow.keras import layers, models


In [59]:
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]
)


In [60]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weight_dict = dict(enumerate(class_weights))


In [61]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=1024,
    class_weight=class_weight_dict,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_auc',
            patience=3,
            restore_best_weights=True
        )
    ]
)


Epoch 1/20
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.8787 - auc: 0.9563 - loss: 0.2724 - precision: 0.0410 - recall: 0.8913 - val_accuracy: 0.9675 - val_auc: 0.9925 - val_loss: 0.1066 - val_precision: 0.1010 - val_recall: 0.9389
Epoch 2/20
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.9483 - auc: 0.9891 - loss: 0.1333 - precision: 0.0963 - recall: 0.9454 - val_accuracy: 0.9640 - val_auc: 0.9953 - val_loss: 0.0861 - val_precision: 0.0947 - val_recall: 0.9720
Epoch 3/20
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9564 - auc: 0.9929 - loss: 0.1057 - precision: 0.1136 - recall: 0.9599 - val_accuracy: 0.9653 - val_auc: 0.9967 - val_loss: 0.0802 - val_precision: 0.0981 - val_recall: 0.9767
Epoch 4/20
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9615 - auc: 0.9943 - loss: 0.0927 - precision: 0.1272 - recall: 

In [69]:
class ResNeXtDNNBlock(layers.Layer):
    def __init__(self, hidden_dim=128, cardinality=8):
        super().__init__()

        self.branches = [
            models.Sequential([
                layers.Dense(hidden_dim, activation='relu'),
                layers.Dense(hidden_dim)
            ])
            for _ in range(cardinality)
        ]

        self.activation = layers.ReLU()

    def call(self, x):
        branch_outputs = [branch(x) for branch in self.branches]
        aggregated = tf.add_n(branch_outputs)
        return self.activation(aggregated + x)

In [74]:
inputs = layers.Input(shape=(X_train.shape[1],))

x = layers.Dense(128)(inputs)   # projection layer
x = ResNeXtDNNBlock(128, cardinality=8)(x)
x = ResNeXtDNNBlock(128, cardinality=8)(x)

x = layers.Dropout(0.3)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model = models.Model(inputs, outputs)


In [75]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]
)

In [76]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=1024,
    class_weight=class_weight_dict,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_auc',
            patience=3,
            restore_best_weights=True
        )
    ]
)

Epoch 1/20
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.9313 - auc: 0.9820 - loss: 0.1740 - precision: 0.0727 - recall: 0.9250 - val_accuracy: 0.9685 - val_auc: 0.9933 - val_loss: 0.0931 - val_precision: 0.1045 - val_recall: 0.9469
Epoch 2/20
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 12ms/step - accuracy: 0.9549 - auc: 0.9928 - loss: 0.1059 - precision: 0.1104 - recall: 0.9619 - val_accuracy: 0.9703 - val_auc: 0.9916 - val_loss: 0.0650 - val_precision: 0.1101 - val_recall: 0.9469
Epoch 3/20
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - accuracy: 0.9594 - auc: 0.9941 - loss: 0.0941 - precision: 0.1216 - recall: 0.9664 - val_accuracy: 0.9794 - val_auc: 0.9955 - val_loss: 0.0606 - val_precision: 0.1525 - val_recall: 0.9529
Epoch 4/20
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.9603 - auc: 0.9944 - loss: 0.0912 - precision: 0.1243 - 