In [1]:
from pathlib import Path
import pandas as pd

DATA_PATH = Path(r"C:\BFSI\card_fraud.csv")

df = pd.read_csv(DATA_PATH)

print("Dataset loaded successfully!")
print(df.head())


Dataset loaded successfully!
   Transaction_ID   User_ID  Transaction_Amount Transaction_Date  \
0            1001  68389745            69635000    1/1/2024 0:00   
1            1002  42122340            53486000    1/1/2024 0:01   
2            1003  87539955            24262000    1/1/2024 0:02   
3            1004  98657863            56019000    1/1/2024 0:03   
4            1005  88084360            87823000    1/1/2024 0:04   

  Transaction_Time Transaction_Location  Merchant_ID  Device_ID Card_Type  \
0          0:00:00         Surkhandarya         6710       2060    UzCard   
1          0:01:00             Namangan         6498       2797    UzCard   
2          0:02:00               Navoiy         5039       2519      Humo   
3          0:03:00              Bukhara         6115       2641      Humo   
4          0:04:00              Andijan         5072       2923      Humo   

  Transaction_Currency Transaction_Status  Previous_Transaction_Count  \
0                  UZS    

In [2]:
# Identify target column
if "isFraud" in df.columns:
    target = "isFraud"
else:
    raise ValueError("Target column 'isFraud' not found. Please verify dataset.")

# Drop ID-like columns (do NOT use in training)
id_columns = ["Transaction_ID", "User_ID", "Merchant_ID", "Device_ID"]
existing_ids = [col for col in id_columns if col in df.columns]

X = df.drop(columns=existing_ids + [target])
y = df[target]


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 80000
Testing samples: 20000


In [4]:
''Build Preprocessing Pipeline
Includes:
Imputation
Scaling
One-Hot Encoding'''
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import numpy as np

numeric_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns

numeric_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transform, numeric_features),
        ("cat", categorical_transform, categorical_features),
    ]
)

preprocessor


SyntaxError: unterminated triple-quoted string literal (detected at line 32) (2939586900.py, line 5)

In [None]:
from sklearn.linear_model import LogisticRegression

model_lr = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

model_lr.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

def evaluate(model, name):
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]
    
    print(f"\n=== {name} Evaluation ===")
    print(classification_report(y_test, preds))
    print("ROC AUC Score:", roc_auc_score(y_test, probs))
    print("Confusion Matrix:\n", confusion_matrix(y_test, preds))

evaluate(model_lr, "Logistic Regression")
evaluate(model_rf, "Random Forest")

try:
    evaluate(model_lgb, "LightGBM")
except:
    pass


In [None]:
scores = {}

for name, model in {
    "LR": model_lr,
    "RF": model_rf,
    "LGB": model_lgb if 'model_lgb' in globals() else None
}.items():
    if model:
        probs = model.predict_proba(X_test)[:, 1]
        scores[name] = roc_auc_score(y_test, probs)

best_model_name = max(scores, key=scores.get)
print("Best Model:", best_model_name)

best_model = {
    "LR": model_lr,
    "RF": model_rf,
    "LGB": model_lgb if 'model_lgb' in globals() else None
}[best_model_name]


In [5]:
import joblib

MODEL_PATH = r"C:\BFSI\fraud_detection_model.pkl"
joblib.dump(best_model, MODEL_PATH)

print("Model saved to:", MODEL_PATH)


NameError: name 'best_model' is not defined