In [1]:
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve,
    precision_recall_curve, auc, confusion_matrix
)
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [3]:
# Location of the transaction log. Set the FRAUDSHIELD_DATA environment
# variable to point at the CSV on another machine; when it is unset, the
# original author's local path is used as the fallback.
DATA_PATH = Path(
    os.environ.get(
        "FRAUDSHIELD_DATA",
        r"C:\Users\SACHIN\OneDrive\Desktop\fraudshield\data\PS_transaction_log.csv",
    )
)

# Fail early with an actionable message instead of a bare pandas error.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Transaction log not found at {DATA_PATH!s}. "
        "Set the FRAUDSHIELD_DATA environment variable to the CSV location."
    )

df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')
print("Dataset shape:", df.shape)
df.head(5)

Dataset shape: (6362620, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
# Unsupervised training view: the same rows as `df`, minus whichever of the
# label columns are present (columns that are absent are simply skipped).
label_columns = ["isFraud", "isFlaggedFraud"]
df_train = df.drop(columns=label_columns, errors="ignore")

print("Training columns:", df_train.columns.tolist())


Training columns: ['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest']


In [5]:
# Evaluation needs the ground-truth column, so stop here when it is missing.
has_labels = "isFraud" in df.columns
if not has_labels:
    raise Exception("Dataset must contain isFraud for evaluation.")

# Independent copy of `df` that keeps the labels; `y_true` is its label column.
df_eval = df.copy()
y_true = df_eval["isFraud"]

print("Fraud distribution:\n", y_true.value_counts())


Fraud distribution:
 isFraud
0    6354407
1       8213
Name: count, dtype: int64


In [6]:
# Fit the transaction-type encoder on the training frame, then apply that same
# fitted encoder to both frames so the integer codes agree.
le = LabelEncoder().fit(df_train["type"])

# Add the encoded type and the balance change (new - old) on the origin and
# destination side as new columns on both frames.
for frame in (df_train, df_eval):
    frame["type_code"] = le.transform(frame["type"])
    frame["balance_delta_org"] = frame["newbalanceOrig"] - frame["oldbalanceOrg"]
    frame["balance_delta_dest"] = frame["newbalanceDest"] - frame["oldbalanceDest"]

# Columns passed to the model, in this order.
features = [
    "amount",
    "oldbalanceOrg",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
    "step",
    "type_code",
    "balance_delta_org",
    "balance_delta_dest",
]

# Model inputs: the selected columns with missing values replaced by 0.
X_train, X_eval = (frame[features].fillna(0) for frame in (df_train, df_eval))

print("Training feature matrix:", X_train.shape)
print("Evaluation feature matrix:", X_eval.shape)


Training feature matrix: (6362620, 9)
Evaluation feature matrix: (6362620, 9)


In [7]:
# Fit the scaler on the training matrix only, then apply the same fitted
# transformation to the training and evaluation matrices.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_eval_scaled = (scaler.transform(X) for X in (X_train, X_eval))

print("Scaling complete.")


Scaling complete.


In [8]:
# Isolation Forest hyper-parameters, gathered in one place so they are easy
# to find and tune.
if_params = {
    "n_estimators": 200,
    "contamination": 0.002,
    "max_samples": "auto",
    "random_state": 42,  # fixed seed for reproducible trees
}

model = IsolationForest(**if_params)
model.fit(X_train_scaled)

print("Isolation Forest trained successfully.")


Isolation Forest trained successfully.


In [9]:
# Score the evaluation set ONCE. IsolationForest.predict() is itself just a
# threshold on decision_function() (negative → -1, otherwise 1), so calling
# both predict() and decision_function() would run every row through the
# whole forest twice. Deriving the labels from the scores gives the same
# result with a single pass.
decision = model.decision_function(X_eval_scaled)

# -1 → anomaly | 1 → normal (same values predict() would return)
pred_if = np.where(decision < 0, -1, 1)

df_eval["pred_label"] = np.where(decision < 0, 1, 0)  # 1 = suspicious
df_eval["score"] = -decision  # sign flipped: higher = more suspicious

df_eval[["isFraud", "pred_label", "score"]].head(10)


Unnamed: 0,isFraud,pred_label,score
0,0,0,-0.316905
1,0,0,-0.339855
2,1,0,-0.309073
3,1,0,-0.330589
4,0,0,-0.334581
5,0,0,-0.335601
6,0,0,-0.316719
7,0,0,-0.316063
8,0,0,-0.344152
9,0,0,-0.320807


In [23]:
# Preview of the first 20 rows the model flagged (pred_label == 1), shown
# next to their score and true label.
flagged_mask = df_eval["pred_label"] == 1
preview_columns = [
    "type", "amount", "oldbalanceOrg", "newbalanceOrig",
    "score", "pred_label", "isFraud",
]
df_eval.loc[flagged_mask, preview_columns].head(20)


Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,score,pred_label,isFraud
375,TRANSFER,2545478.01,0.0,0.0,0.001332,1,0
432,CASH_IN,349505.89,7330235.59,7679741.48,0.01205,1,0
656,CASH_IN,770537.37,8499043.13,8499043.13,0.011384,1,0
1324,CASH_IN,26295.3,8723514.56,8749809.86,0.001186,1,0
1818,TRANSFER,2317408.88,4165916.16,1848507.28,0.057722,1,0
1823,TRANSFER,2604219.11,575667.54,0.0,0.020574,1,0
4441,CASH_OUT,10000000.0,10000000.0,0.0,0.021341,1,1
4492,CASH_IN,546131.94,5049604.5,5595736.44,0.007954,1,0
4519,CASH_IN,17058.54,10455309.26,10472367.8,0.002338,1,0
4968,CASH_IN,360262.92,11163522.31,11523785.23,0.016991,1,0


In [None]:
y_pred, scores = df_eval["pred_label"], df_eval["score"]

# Metrics computed from the hard 0/1 predictions.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, metric(y_true, y_pred))

# Ranking metrics computed from the continuous anomaly score.
# ROC-AUC
roc_auc = roc_auc_score(y_true, scores)
print("ROC-AUC:", roc_auc)

# PR-AUC: area under the precision-recall curve, integrated with auc().
precision, recall, _ = precision_recall_curve(y_true, scores)
pr_auc = auc(recall, precision)
print("PR-AUC:", pr_auc)

# Confusion matrix of true labels vs. hard predictions.
cm = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix:\n", cm)


Accuracy: 0.9968580553294083
Precision: 0.037246581801037244
Recall: 0.057713381224887374
F1-score: 0.045274368403457665
ROC-AUC: 0.8638808580554904
PR-AUC: 0.013168276949891087

Confusion Matrix:
 [[6342155   12252]
 [   7739     474]]


In [None]:
# Persist the fitted objects (model, scaler, type encoder) under fixed file
# names relative to the current working directory.
artifacts = {
    "model_if.pkl": model,
    "scaler.pkl": scaler,
    "type_encoder.pkl": le,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

print("Model, scaler, and encoder saved successfully!")


Model, scaler, and encoder saved successfully!
