In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kartik2112/fraud-detection")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'fraud-detection' dataset.
Path to dataset files: /kaggle/input/fraud-detection


In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

In [7]:
DATA_DIR = "/kaggle/input/fraud-detection"

train_path =os.path.join(DATA_DIR, "fraudTrain.csv")
test_path =os.path.join(DATA_DIR, "fraudTest.csv")

print("Train file exists:", os.path.exists(train_path))
print("Test file exists:", os.path.exists(test_path))

Train file exists: True
Test file exists: True


In [8]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

print("\nTrain columns:\n", train_df.columns)

Train shape: (1296675, 23)
Test shape: (555719, 23)

Train columns:
 Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


In [9]:
train_df["source"] = "train"
test_df["source"]  = "test"

full_df = pd.concat([train_df, test_df], ignore_index=True)
print("\nCombined shape:", full_df.shape)


Combined shape: (1852394, 24)


In [10]:
cols_to_drop = ["Unnamed: 0", "first", "last", "street", "dob", "trans_num", "trans_date_trans_time"]

print("\nColumns after dropping some IDs/PII:\n", full_df.columns)



Columns after dropping some IDs/PII:
 Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud', 'source'],
      dtype='object')


In [11]:
cat_cols = full_df.select_dtypes(include=["object"]).columns
print("\nCategorical columns to encode:\n", list(cat_cols))


Categorical columns to encode:
 ['trans_date_trans_time', 'merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num', 'source']


In [12]:
train_proc = full_df[full_df["source"] == "train"].drop(columns=["source"])
test_proc  = full_df[full_df["source"] == "test"].drop(columns=["source"])

X = train_proc.drop(columns=["is_fraud"])
y = train_proc["is_fraud"]

X_test_full = test_proc.drop(columns=["is_fraud"])
y_test_full = test_proc["is_fraud"]

print("\nProcessed train shape:", X.shape)
print("Processed test shape :", X_test_full.shape)

print("\nClass distribution in training data:")
print(y.value_counts(normalize=True))


Processed train shape: (1296675, 22)
Processed test shape : (555719, 22)

Class distribution in training data:
is_fraud
0    0.994211
1    0.005789
Name: proportion, dtype: float64


In [13]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("\nTrain samples:", X_train.shape[0])
print('val  samples:', X_val.shape[0])


Train samples: 1037340
val  samples: 259335


In [16]:
non_numeric_cols = [
    "Unnamed: 0",
    "trans_date_trans_time",
    "cc_num", # Often treated as an identifier, not a feature for scaling
    "merchant",
    "category",
    "first",
    "last",
    "gender",
    "street",
    "city",
    "state",
    "job",
    "dob",
    "trans_num", # Often treated as an identifier, not a feature for scaling
]

# Ensure only numerical columns are passed to the scaler
X_train_numeric = X_train.drop(columns=non_numeric_cols, errors='ignore')
X_val_numeric   = X_val.drop(columns=non_numeric_cols, errors='ignore')
X_test_full_numeric = X_test_full.drop(columns=non_numeric_cols, errors='ignore')

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_numeric)
X_val_scaled   = scaler.transform(X_val_numeric)
X_test_scaled  = scaler.transform(X_test_full_numeric)

In [14]:
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1
    ),
    "Decision Tree": DecisionTreeClassifier(
        class_weight="balanced",
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators = 200,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    )
}

results = {}
best_model = None
best_name = None
best_f1 = 0.0

In [18]:
results = {}
best_f1 = 0.0
best_model = None
best_name = ""

# 1) Logistic Regression
print("Training Logistic Regression...")
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, y_train)

y_val_pred = log_model.predict(X_val_scaled)

acc = accuracy_score(y_val, y_val_pred)
prec = precision_score(y_val, y_val_pred, zero_division=0)
rec = recall_score(y_val, y_val_pred, zero_division=0)
f1 = f1_score(y_val, y_val_pred, zero_division=0)

print("Logistic Regression Results:")
print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)
print("Classification Report:")
print(classification_report(y_val, y_val_pred))

results["Logistic Regression"] = {
    "accuracy": acc,
    "precision": prec,
    "recall": rec,
    "f1": f1
}

if f1 > best_f1:
    best_f1 = f1
    best_model = log_model
    best_name = "Logistic Regression"


# 2) Decision Tree
print("\nTraining Decision Tree...")
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_scaled, y_train)

y_val_pred = dt_model.predict(X_val_scaled)

acc = accuracy_score(y_val, y_val_pred)
prec = precision_score(y_val, y_val_pred, zero_division=0)
rec = recall_score(y_val, y_val_pred, zero_division=0)
f1 = f1_score(y_val, y_val_pred, zero_division=0)

print("Decision Tree Results:")
print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)
print("Classification Report:")
print(classification_report(y_val, y_val_pred))

results["Decision Tree"] = {
    "accuracy": acc,
    "precision": prec,
    "recall": rec,
    "f1": f1
}

if f1 > best_f1:
    best_f1 = f1
    best_model = dt_model
    best_name = "Decision Tree"


# 3) Random Forest
print("\nTraining Random Forest...")
rf_model = RandomForestClassifier(
    n_estimators=10,
    random_state=42
)
rf_model.fit(X_train_scaled, y_train)

y_val_pred = rf_model.predict(X_val_scaled)

acc = accuracy_score(y_val, y_val_pred)
prec = precision_score(y_val, y_val_pred, zero_division=0)
rec = recall_score(y_val, y_val_pred, zero_division=0)
f1 = f1_score(y_val, y_val_pred, zero_division=0)

print("Random Forest Results:")
print("Accuracy :", acc)
print("Precision:", prec)
print("Recall   :", rec)
print("F1 Score :", f1)
print("Classification Report:")
print(classification_report(y_val, y_val_pred))

results["Random Forest"] = {
    "accuracy": acc,
    "precision": prec,
    "recall": rec,
    "f1": f1
}

if f1 > best_f1:
    best_f1 = f1
    best_model = rf_model
    best_name = "Random Forest"


# Summary
print("Validation Results Summary:")
for name in results:
    print(name, "-> F1:", results[name]["f1"], "| Acc:", results[name]["accuracy"])

print("Best model on validation set:", best_name, "with F1 =", best_f1)


Training Logistic Regression...
Logistic Regression Results:
Accuracy : 0.9936992692849018
Precision: 0.0
Recall   : 0.0
F1 Score : 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    257834
           1       0.00      0.00      0.00      1501

    accuracy                           0.99    259335
   macro avg       0.50      0.50      0.50    259335
weighted avg       0.99      0.99      0.99    259335


Training Decision Tree...
Decision Tree Results:
Accuracy : 0.9931748510613685
Precision: 0.41535556954059155
Recall   : 0.4397068620919387
F1 Score : 0.42718446601941745
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257834
           1       0.42      0.44      0.43      1501

    accuracy                           0.99    259335
   macro avg       0.71      0.72      0.71    259335
weighted avg       0.99      0.99      0.99    

In [20]:
# Testing on test dataset
print("Evaluating the best model:", best_name)

# Predict on test data
y_test_pred = best_model.predict(X_test_scaled)

# Check accuracy, precision, recall, f1
test_acc = accuracy_score(y_test_full, y_test_pred)
test_prec = precision_score(y_test_full, y_test_pred, zero_division=0)
test_rec = recall_score(y_test_full, y_test_pred, zero_division=0)
test_f1 = f1_score(y_test_full, y_test_pred, zero_division=0)

print("Test Results:")
print("Accuracy :", test_acc)
print("Precision:", test_prec)
print("Recall   :", test_rec)
print("F1 Score :", test_f1)

if hasattr(best_model, "predict_proba"):
    y_test_proba = best_model.predict_proba(X_test_scaled)[:, 1]
    test_roc_auc = roc_auc_score(y_test_full, y_test_proba)
    print("ROC AUC  :", test_roc_auc)
else:
    print("ROC AUC  : Not available for this model")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test_full, y_test_pred))

# Confusion Matrix
cm = confusion_matrix(y_test_full, y_test_pred)
print("\nConfusion Matrix:")
print(cm)


Evaluating the best model: Random Forest
Test Results:
Accuracy : 0.9956308853935172
Precision: 0.29756795422031473
Recall   : 0.09696969696969697
F1 Score : 0.14627285513361463
ROC AUC  : 0.8253077583418242

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.30      0.10      0.15      2145

    accuracy                           1.00    555719
   macro avg       0.65      0.55      0.57    555719
weighted avg       0.99      1.00      0.99    555719


Confusion Matrix:
[[553083    491]
 [  1937    208]]
