# **Credit Card Fraud Detection - Using Isolation Forest**

# Data Exploration & Preprocessing

### 1.1 Load Data


In [None]:
import pandas as pd
import kagglehub
import os


In [None]:
# Fetch the credit-card fraud dataset from Kaggle; `path` holds the value
# returned by kagglehub and is reused when the CSV is loaded.
dataset_handle = "mlg-ulb/creditcardfraud"
path = kagglehub.dataset_download(dataset_handle)
print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3...


100%|██████████| 66.0M/66.0M [00:01<00:00, 42.2MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/mlg-ulb/creditcardfraud/versions/3


In [None]:
# Read the transactions CSV from the downloaded dataset directory
csv_file = 'creditcard.csv'
csv_path = os.path.join(path, csv_file)
data = pd.read_csv(csv_path)

# Preview the first rows
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### 1.2 Check for missing values & class distribution

In [None]:
# Count missing entries per column (isna is the pandas alias of isnull)
missing_values = data.isna().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [None]:
# Analyzing the class distribution:
# relative frequency of each value in the 'Class' column, expressed as a percentage.
class_share = data['Class'].value_counts(normalize=True)
class_distribution = class_share.mul(100)
print("Class Distribution:\n", class_distribution)

Class Distribution:
 Class
0    99.827251
1     0.172749
Name: proportion, dtype: float64


### 1.3 Scaling the Dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Min-max scale the 'Time' and 'Amount' columns in place.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed later in the notebook, so test-set min/max values influence
# the scaling — consider fitting on the training split only.
scaled_columns = ['Time', 'Amount']
scaler = MinMaxScaler()
data[scaled_columns] = scaler.fit_transform(data[scaled_columns])

# Preview the scaled columns
print("\nScaled Dataset:\n", data[scaled_columns].head())


Scaled Dataset:
        Time    Amount
0  0.000000  0.005824
1  0.000000  0.000105
2  0.000006  0.014739
3  0.000006  0.004807
4  0.000012  0.002724


# **Data Preparation**

### 2.1 Splitting the Dataset

## 2.2 Understanding the Target Variable (`Class`)
- **`Class = 0`**: Non-fraudulent transactions (normal).
- **`Class = 1`**: Fraudulent transactions (anomalies).


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate features and target
X = data.drop('Class', axis=1)  # Features
y = data['Class']               # Target


In [None]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Check the distribution in the splits

print("Training set distribution:\n", y_train.value_counts(normalize=True) * 100)

print("\nTesting set distribution:\n", y_test.value_counts(normalize=True) * 100)

Training set distribution:
 Class
0    99.827075
1     0.172925
Name: proportion, dtype: float64

Testing set distribution:
 Class
0    99.827955
1     0.172045
Name: proportion, dtype: float64


# **Building and Training the Isolation Forest Model**

In [None]:
from sklearn.ensemble import IsolationForest
import numpy as np

### 3.1 Define Parameters of the model

In [None]:
# Initialize the Isolation Forest model
iso_forest = IsolationForest(
    n_estimators=500,           # Number of trees in the forest
    contamination=0.3,       # Proportion of anomalies (0.17% from the dataset description)
    random_state=42,            # Ensures consistent results
)

In [None]:
# Train the model on the training data (only features, no target)
# Unsupervised fit: only the training features are passed, never the labels
iso_forest.fit(X=X_train)

In [None]:
# Predict anomalies (fraud) on training and testing sets
y_train_pred = iso_forest.predict(X_train)
y_test_pred = iso_forest.predict(X_test)

In [None]:
# Map predictions to 0 and 1
# -1 indicates an anomaly, 1 indicates normal in Isolation Forest
y_train_pred = np.where(y_train_pred == -1, 1, 0)
y_test_pred = np.where(y_test_pred == -1, 1, 0)

In [None]:
# Print some results for verification
print("\nSample Predictions on Test Data:")
print(y_test_pred[:10])


Sample Predictions on Test Data:
[0 1 1 0 1 0 0 0 0 0]


# Evaluating the Isolation Forest Model


In [None]:
from sklearn.metrics import classification_report, precision_recall_curve, auc
import matplotlib.pyplot as plt

In [None]:
# Evaluate on the test set
print("Classification Report (Test Set):\n")
print(classification_report(y_test, y_test_pred, target_names=['Non-Fraud', 'Fraud']))


Classification Report (Test Set):

              precision    recall  f1-score   support

   Non-Fraud       1.00      0.70      0.82     56864
       Fraud       0.01      0.96      0.01        98

    accuracy                           0.70     56962
   macro avg       0.50      0.83      0.42     56962
weighted avg       1.00      0.70      0.82     56962



In [None]:
# Precision-Recall Curve
precision, recall, thresholds = precision_recall_curve(y_test, y_test_pred)
pr_auc = auc(recall, precision)

print(f"\nArea Under Precision-Recall Curve (AUPRC): {pr_auc:.4f}")



Area Under Precision-Recall Curve (AUPRC): 0.4824


### 4.1 Precision-Recall Curve

# Improving the Model using SMOTE and a Random Forest Classifier

In [None]:
!pip install imbalanced-learn




In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, auc


In [None]:
# Re-derive the target (fraud or non-fraud) and the feature matrix from `data`
y = data['Class']
X = data.drop(columns=['Class'])


In [None]:
# Spliting the data into Training set and Testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# Applying SMOTE to the Training Set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)



In [None]:
# Training the Model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_res, y_train_res)


In [None]:
# Evaluating the model performance using AUPRC (Area Under Precision-Recall Curve)

# Column 1 of predict_proba holds the probability of the positive class (fraud)
fraud_column = 1
test_probabilities = clf.predict_proba(X_test)
y_prob = test_probabilities[:, fraud_column]

# Precision / recall pairs over the probability thresholds (thresholds unused)
precision, recall, _ = precision_recall_curve(y_test, y_prob)

# Area under the precision-recall curve
pr_auc = auc(recall, precision)
print(f"Area Under Precision-Recall Curve (AUPRC): {pr_auc:.4f}")


Area Under Precision-Recall Curve (AUPRC): 0.8780


In [None]:
# Hard class predictions from `clf`, the classifier trained on the SMOTE-resampled data
y_pred = clf.predict(X=X_test)


In [None]:
# Per-class precision / recall / F1 of the Random Forest on the test split
class_names = ['Non-Fraud', 'Fraud']
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)


              precision    recall  f1-score   support

   Non-Fraud       1.00      1.00      1.00     85307
       Fraud       0.85      0.88      0.86       136

    accuracy                           1.00     85443
   macro avg       0.92      0.94      0.93     85443
weighted avg       1.00      1.00      1.00     85443

