In [4]:
# imports
import kagglehub
import numpy as np
import pandas as pd

# scikit-learn stuff
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


# download the dataset
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

print("path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


path to dataset files: /home/tygo/.cache/kagglehub/datasets/mlg-ulb/creditcardfraud/versions/3


In [5]:
import pandas as pd

# Trying to get a better sense of the data
csv_path = path + '/creditcard.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
df.describe()

In [6]:
# split the dataset into training and test, use SMOTE since the dataset is imbalanced

X = df.iloc[:, :30] # all rows, first 30 columns
y = df.iloc[:, 30]  # all rows, last column

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [None]:
# define the model and fit it on the training data
from sklearn.linear_model import RidgeClassifier

rc = RidgeClassifier(alpha=1, solver='sag') # decided on saga for the large number of features and samples
clf = rc.fit(X_train_res, y_train_res)
clf.score(X_test, y_test)


So, this score is good, but too good to be true without much work. I used SMOTE to balance the dataset, which was a good choice I believe, but the choice of model could be slightly better. Models like random forest, XGBoost, or a NN may capture non-linear patterns better. Additionally, accuracy is not the only measure of performance, especially on such an imbalanced dataset. 

The next step is to use better evaluation metrics. Going to fit two other models, and display more informative metrics like precision, recall, and F1-score for the minority class.

In [None]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

Looking at the results, the model actually fails about 25% of the fraudulent cases. With only a 35% precision, which could be costly in something fragile like fraud detection. 

Trying a random forest classifier:

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier()
rf_clf.fit(X_train_res, y_train_res)
rf_clf.score(X_test, y_test)

In [None]:
y_rf_pred = rf_clf.predict(X_test)
print(classification_report(y_test, y_rf_pred))

Pretty good results,  can see that 90% of cases that were classified as fraud were indeed fraudulent cases, but a slightly problematic recall of 82%. Now I want to try a neural net trained on the SMOTE dataset. 

In [24]:
# had to use tf-nightly since this venv is on python 3.13
import keras
import tensorflow as tf
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from sklearn.metrics import classification_report

In [22]:
n_inputs = X_train_res.shape[1]

smote_model = Sequential([
    Dense(128, input_shape=(n_inputs,), activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

smote_model.summary()

smote_model.compile(optimizer=Adam(learning_rate=0.0005),
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

In [None]:
smote_model.fit(X_train_res, y_train_res,
                epochs=20,
                batch_size=64,
                validation_split=0.2,
                shuffle=True,
                verbose=1)

Epoch 1/20
[1m1660/4976[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m11s[0m 3ms/step - accuracy: 0.6192 - loss: 0.6778

KeyboardInterrupt: 

In [None]:
y_pred_prob = smote_model.predict(X_test, verbose=1)
y_pred = (y_pred_prob > 0.5).astype(int)

In [None]:
print(classification_report(y_test, y_pred))

In [12]:
# Trying to scale the data to improve performance
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_res_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

smote_model.fit(X_train_res_scaled, y_train_res, 
                epochs=20,
                batch_size=32,
                validation_split=0.2,
                verbose=1)

Epoch 1/20
[1m9951/9951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 3ms/step - accuracy: 0.9757 - loss: 0.0673 - val_accuracy: 0.9964 - val_loss: 0.0111
Epoch 2/20
[1m9951/9951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - accuracy: 0.9931 - loss: 0.0201 - val_accuracy: 0.9985 - val_loss: 0.0054
Epoch 3/20
[1m9951/9951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - accuracy: 0.9949 - loss: 0.0153 - val_accuracy: 0.9993 - val_loss: 0.0031
Epoch 4/20
[1m9951/9951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - accuracy: 0.9961 - loss: 0.0127 - val_accuracy: 0.9998 - val_loss: 0.0023
Epoch 5/20
[1m9951/9951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - accuracy: 0.9968 - loss: 0.0109 - val_accuracy: 0.9997 - val_loss: 0.0018
Epoch 6/20
[1m9951/9951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - accuracy: 0.9968 - loss: 0.0102 - val_accuracy: 0.9999 - val_loss: 0.0018
Epoch 7/20

<keras.src.callbacks.history.History at 0x7f995b809090>

In [13]:
y_pred_prob_scaled = smote_model.predict(X_test_scaled, verbose=1)
y_pred = (y_pred_prob_scaled > 0.5).astype(int)
print(classification_report(y_test, y_pred))

[1m2671/2671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.68      0.83      0.75       147

    accuracy                           1.00     85443
   macro avg       0.84      0.91      0.87     85443
weighted avg       1.00      1.00      1.00     85443



In [25]:
# Now try balancing the loss to penalize false negatives
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=np.unique(y_train_res), y=y_train_res)
class_weights_dict = dict(enumerate(class_weights))
smote_model.fit(X_train_res_scaled, y_train_res,
                epochs=20,
                batch_size=32,
                validation_split=0.2,
                class_weight=class_weights_dict,
                callbacks=[tf.keras.callbacks.EarlyStopping(patience=5)],
                verbose=1)

Epoch 1/20
[1m9951/9951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 4ms/step - accuracy: 0.9704 - loss: 0.0832 - val_accuracy: 0.9932 - val_loss: 0.0180
Epoch 2/20
[1m9951/9951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - accuracy: 0.9923 - loss: 0.0233 - val_accuracy: 0.9999 - val_loss: 0.0017
Epoch 3/20
[1m9951/9951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - accuracy: 0.9949 - loss: 0.0155 - val_accuracy: 0.9992 - val_loss: 0.0030
Epoch 4/20
[1m9951/9951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - accuracy: 0.9960 - loss: 0.0127 - val_accuracy: 0.9997 - val_loss: 0.0020
Epoch 5/20
[1m9951/9951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 3ms/step - accuracy: 0.9966 - loss: 0.0107 - val_accuracy: 0.9999 - val_loss: 0.0014
Epoch 6/20
[1m9951/9951[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 3ms/step - accuracy: 0.9969 - loss: 0.0100 - val_accuracy: 0.9999 - val_loss: 0.0016
Epoch 7/20

<keras.src.callbacks.history.History at 0x7f99dc61a780>

In [26]:
y_pred_prob = smote_model.predict(X_test_scaled, verbose=1)
y_pred = (y_pred_prob > 0.6).astype(int)
print(classification_report(y_test, y_pred))

[1m2671/2671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.73      0.84      0.78       147

    accuracy                           1.00     85443
   macro avg       0.87      0.92      0.89     85443
weighted avg       1.00      1.00      1.00     85443



In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

smote_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

disp = ConfusionMatrixDisplay(smote_matrix, display_labels=['No Fraud', 'Fraud'])
disp.plot()
plt.show()