In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the credit-card fraud dataset through kagglehub and keep the path it
# returns; the loading step joins 'creditcard.csv' onto this path.
creditcardfraud_handle = 'mlg-ulb/creditcardfraud'
mlg_ulb_creditcardfraud_path = kagglehub.dataset_download(creditcardfraud_handle)

print('Data source import complete.')

Using Colab cache for faster access to the 'creditcardfraud' dataset.
Data source import complete.


In [2]:
import os
import pandas as pd
import numpy as np

# --- Load the raw transactions -------------------------------------------
data_path = os.path.join(mlg_ulb_creditcardfraud_path, 'creditcard.csv')
df = pd.read_csv(data_path)

# --- Engineered feature ---------------------------------------------------
# Amount divided by (Time + 1); the +1 keeps the denominator non-zero when
# Time is 0.
df['TransactionDensity'] = df['Amount'] / (df['Time'] + 1)

# --- Missing values -------------------------------------------------------
# Every numeric column is filled with its own column mean.
# NOTE(review): this runs after TransactionDensity is derived and also covers
# 'Class', so a NaN in Amount/Time/Class would be mean-filled rather than
# recomputed or dropped. Confirm the CSV really has no missing values.
df = df.fillna(df.mean(numeric_only=True))

# --- Shrink the dataset to 25% of its rows --------------------------------
# All Class == 1 rows are kept; the remainder of the 25% budget is drawn at
# random from the Class == 0 rows.
total_records = len(df)
target_sample_size = int(total_records * 0.25)

df_minority = df[df['Class'] == 1]
df_majority = df[df['Class'] == 0]

minority_size = len(df_minority)
majority_size_needed = target_sample_size - minority_size

if majority_size_needed > 0:
    df_majority_undersampled = df_majority.sample(
        n=majority_size_needed,
        random_state=42,
    )
    combined_rows = pd.concat([df_majority_undersampled, df_minority])
    # frac=1 shuffles the combined rows; the original index labels are kept.
    df_sampled = combined_rows.sample(frac=1, random_state=42)
else:
    # The minority class alone already meets or exceeds the target size.
    df_sampled = df_minority.copy()

df = df_sampled

print(f"New sampled DataFrame size: {len(df)}")


# --- Features and target --------------------------------------------------
# Columns that get standardised later, followed by the V1..V28 columns.
features_to_scale_names = ['Amount', 'TransactionDensity']
pca_features = [f'V{i}' for i in range(1, 29)]

# X: full feature matrix (an independent copy, so later edits do not touch df)
model_columns = features_to_scale_names + pca_features
X = df[model_columns].copy()

# y: target labels
y = df['Class']

New sampled DataFrame size: 71201


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows for testing, stratified on the label so both
# partitions keep the same class ratio. Splitting happens before scaling so
# the scaler statistics come from the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Learn mean/variance on the training partition, then apply the same
# transformation to both partitions (only the columns listed in
# features_to_scale_names are touched).
scaler = StandardScaler().fit(X_train[features_to_scale_names])

for partition in (X_train, X_test):
    partition[features_to_scale_names] = scaler.transform(partition[features_to_scale_names])

In [4]:
from sklearn.ensemble import IsolationForest

# Unsupervised anomaly flagging over all sampled rows (train + test together).
iforest = IsolationForest(contamination=0.01, random_state=42)

# Re-create the full scaled X for the Isolation Forest fit, ordered by index label.
X_full_scaled = pd.concat([X_train, X_test], axis=0)
X_full_scaled = X_full_scaled.sort_index()

# fit_predict returns a plain array in X_full_scaled's (sorted) row order, while
# df's rows were shuffled during sampling. Assigning the bare array would place
# each flag on the wrong transaction, so wrap it in a Series keyed by the index
# labels and let pandas align it to df on assignment.
outlier_flags = pd.Series(
    iforest.fit_predict(X_full_scaled),
    index=X_full_scaled.index,
    name='isOutlier',
)
df['isOutlier'] = outlier_flags

print("Total Anomaly Counts:")
print(df['isOutlier'].value_counts())

Total Anomaly Counts:
isOutlier
 1    70489
-1      712
Name: count, dtype: int64


In [5]:
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC

print("Recall Scores\n")

# Instantiate every baseline classifier up front; all of them share the same
# fit -> predict -> report-recall routine below.
lr_model = LogisticRegression(solver='liblinear', random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
# Linear SVC (optimized replacement for SVC)
svm_model = LinearSVC(C=1, random_state=42, dual=False)

baseline_models = [
    ("Logistic Regression", lr_model),
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("Gradient Boosting", gb_model),
    ("Linear SVC", svm_model),
]

# Train each model on the training split, predict the held-out split, and
# print its recall in the same order as the list above.
baseline_predictions = []
for model_label, classifier in baseline_models:
    classifier.fit(X_train, y_train)
    test_predictions = classifier.predict(X_test)
    baseline_predictions.append(test_predictions)
    print(f"{model_label} Recall: {recall_score(y_test, test_predictions):.4f}")

# Keep the per-model prediction arrays available under their individual names.
y_pred_lr, y_pred_dt, y_pred_rf, y_pred_gb, y_pred_svm = baseline_predictions

Recall Scores

Logistic Regression Recall: 0.8041
Decision Tree Recall: 0.8108
Random Forest Recall: 0.8311
Gradient Boosting Recall: 0.8176
Linear SVC Recall: 0.8041


In [6]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Input

# Seed the Python, NumPy and backend random generators so this cell is
# repeatable, matching the random_state=42 used by every other model here.
keras.utils.set_random_seed(42)

# Define model: two hidden ReLU layers and a single sigmoid output unit.
# The input width is declared with an explicit Input layer; passing
# input_shape= to the first Dense layer is what triggered the Keras warning.
nn_model = Sequential()
nn_model.add(Input(shape=(X_train.shape[1],)))
nn_model.add(Dense(64, activation='relu'))
nn_model.add(Dense(32, activation='relu'))
nn_model.add(Dense(1, activation='sigmoid'))

# Compile model
nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
nn_model.fit(X_train, y_train, epochs=10, batch_size=128, verbose=0)

# Evaluate model: threshold the sigmoid outputs at 0.5 to get hard labels.
y_pred_nn = nn_model.predict(X_test, verbose=0)
y_pred_nn_binary = (y_pred_nn > 0.5).astype('int32')

print(f"Neural Network Recall: {recall_score(y_test, y_pred_nn_binary):.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Neural Network Recall: 0.8311


In [7]:
from sklearn.model_selection import GridSearchCV

# Search space: forest size crossed with maximum tree depth (3 x 3 candidates).
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [5, 10, 15],
}

print("GridSearchCV")
# Exhaustive search with 5-fold cross-validation on the training split,
# ranking candidates by recall and using all available cores.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Grid Search Complete.")
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Recall Score:", f"{grid_search.best_score_:.4f}")

# Score the winning configuration once on the held-out test split.
best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)
best_rf_test_recall = recall_score(y_test, y_pred_best_rf)
print("Test Set Recall Score (using Best Model):", f"{best_rf_test_recall:.4f}")

GridSearchCV
Grid Search Complete.
Best Hyperparameters: {'max_depth': 15, 'n_estimators': 50}
Best Cross-Validation Recall Score: 0.8404
Test Set Recall Score (using Best Model): 0.8378
