In [3]:
import pandas as pd 

df = pd.read_csv(r'C:\Users\RISHI RAMAN\Desktop\ml\Truemodel\fraud_detection.csv')

In [5]:
df

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TRIES
0,1296013,2024-01-04 11:11:06,3664,5239,19.15,72178.0,16,0,3.0
1,914276,2024-01-01 09:42:19,3511,5877,121.11,19023.0,29,0,3.0
2,423923,2024-01-30 10:26:00,1150,4479,10.49,1159.0,21,0,2.0
3,514143,2024-01-09 13:02:49,3114,5352,13.09,29663.0,28,0,2.0
4,706270,2024-01-10 07:50:14,1555,3802,50.13,40740.0,10,0,1.0
...,...,...,...,...,...,...,...,...,...
1755150,802172,2024-01-08 07:42:54,2092,6281,4.35,32591.0,5,0,0.0
1755151,998779,2024-01-12 17:26:29,1020,2893,112.42,9078.0,4,0,3.0
1755152,477151,2024-01-20 21:25:02,1704,6447,23.18,11028.0,22,0,0.0
1755153,1035289,2024-01-29 10:18:55,1434,3696,22.14,35663.0,4,0,4.0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Feature selection
features = ['TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS', 'TRIES', 'CUSTOMER_ID', 'TERMINAL_ID']
X = df[features]
y = df['TX_FRAUD']

# Handle missing values
imputer = SimpleImputer(strategy='mean')

X_imputed = imputer.fit_transform(X)

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, stratify=y, random_state=42)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Train
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))


[[522155    706]
 [    11   3675]]
              precision    recall  f1-score   support

           0     1.0000    0.9986    0.9993    522861
           1     0.8388    0.9970    0.9111      3686

    accuracy                         0.9986    526547
   macro avg     0.9194    0.9978    0.9552    526547
weighted avg     0.9989    0.9986    0.9987    526547



In [9]:
from xgboost import XGBClassifier
model1 = XGBClassifier(scale_pos_weight=999, use_label_encoder=False, eval_metric='logloss')


In [10]:
model1.fit(X_train, y_train)

# Predict
y_pred = model1.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[[522288    573]
 [    65   3621]]
              precision    recall  f1-score   support

           0     0.9999    0.9989    0.9994    522861
           1     0.8634    0.9824    0.9190      3686

    accuracy                         0.9988    526547
   macro avg     0.9316    0.9906    0.9592    526547
weighted avg     0.9989    0.9988    0.9988    526547



In [11]:
# === Model Training Section ===

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
import joblib

# Feature engineering
df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])
df['TX_HOUR'] = df['TX_DATETIME'].dt.hour
df['IS_WEEKEND'] = (df['TX_DATETIME'].dt.weekday >= 5).astype(int)

# Selected features
features = ['TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS', 'TRIES', 'TX_HOUR', 'IS_WEEKEND']
X = df[features]
y = df['TX_FRAUD']

# Imputation & scaling
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()

# XGBoost model with class imbalance handling
model = XGBClassifier(
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Pipeline: Imputer → Scaler → Model
pipeline = Pipeline([
    ('imputer', imputer),
    ('scaler', scaler),
    ('model', model)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# Fit model
pipeline.fit(X_train, y_train)

# Save pipeline
joblib.dump(pipeline, 'fraud_detection_pipeline.pkl')


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


['fraud_detection_pipeline.pkl']