In [46]:
import pandas as pd
import numpy as np
import sqlite3
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline as SKPipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from scipy.stats.mstats import winsorize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import precision_recall_curve, classification_report
from scipy.stats import randint, uniform
from custom_transformers import InitialCleaner, Winsorizer

In [47]:
import sqlite3

# Connect to the SQLite database file
conn = sqlite3.connect('Database.db')
# Replace ‘Database.db' with the path to your .db file if the file is in another directory.

# Create a cursor object
cursor = conn.cursor()

# Query to retrieve table names
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")

# Fetch all table names
tables = cursor.fetchall()

# Print the table names
for table in tables:
    print(table[0])

# Close the connection
conn.close()


Electric_cars
Fraud_detection
Heart_disease
Insurance_Prediction
TripAdviser_Reviews
Ecommerce_data
Automobile_data
Supermarket_data


In [48]:
conn = sqlite3.connect('Database.db')
df = pd.read_sql_query('Select * from Fraud_detection' , conn)
df 


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0


In [49]:
# Separate target and features
X = df.drop('isFraud', axis=1)
y = df['isFraud']

# Identify column types
numeric_cols = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
                'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud']
categorical_cols = ['type']


In [50]:
# Define InitialCleaner transformer
class InitialCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns_to_drop = ['nameOrig', 'nameDest']
        self.null_like_values = ['', ' ', 'nan', 'NaN', 'NULL', 'None']
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X.drop(columns=self.columns_to_drop, errors='ignore', inplace=True)
        X.replace(self.null_like_values, np.nan, inplace=True)
        return X

In [51]:
# Define Winsorizer transformer
class Winsorizer(BaseEstimator, TransformerMixin):
    def __init__(self, limits=(0.01, 0.01)):
        self.limits = limits

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = np.asarray(X)
        X_winsorized = np.apply_along_axis(
            lambda col: winsorize(col, limits=self.limits), axis=0, arr=X
        )
        return X_winsorized


In [52]:
# Numeric pipeline
numeric_pipeline = SKPipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('winsor', Winsorizer()),
    ('scaler', StandardScaler())
])

# Categorical pipeline
categorical_pipeline = SKPipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

# ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

In [53]:
# Final pipeline with SMOTE and XGBoost
pipeline = ImbPipeline(steps=[
    ('initial_cleaner', InitialCleaner()),  # ✔️ Custom transformer
    ('preprocessor', preprocessor),         # ✔️ Column-wise preprocessing
    ('smote', SMOTE(random_state=42)),      # ✔️ Resampling
    ('classifier', XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
        scale_pos_weight=(y.value_counts().iloc[0] / y.value_counts().iloc[1])
    ))
])

# Split data: train / val / test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3713, random_state=42, stratify=y
)

val_size = 1000000 / len(X)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=(1 - val_size / 0.3713),
    random_state=42,
    stratify=y_temp
)


In [54]:
# Hyperparameter search space
param_grid = {
    'classifier__n_estimators': randint(100, 300),
    'classifier__max_depth': [3, 6, 10],
    'classifier__learning_rate': uniform(0.01, 0.2),
    'classifier__subsample': uniform(0.6, 0.4),
    'classifier__colsample_bytree': uniform(0.6, 0.4),
    'classifier__gamma': uniform(0, 5),
    'classifier__min_child_weight': [1, 3, 5]
}

# Randomized Search CV
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=20,
    scoring='f1',
    n_jobs=-1,
    cv=3,
    verbose=3,
    random_state=42,
    error_score='raise'
)

# Ensure labels are int
y_train = y_train.astype(int)
y_val = y_val.astype(int)
y_test = y_test.astype(int)



In [55]:
# Fit model
random_search.fit(X_train, y_train)

# Threshold tuning
best_model = random_search.best_estimator_
y_prob = best_model.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, y_prob)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]


Fitting 3 folds for each of 20 candidates, totalling 60 fits




[CV 3/3] END classifier__colsample_bytree=0.749816047538945, classifier__gamma=4.75357153204958, classifier__learning_rate=0.15639878836228102, classifier__max_depth=3, classifier__min_child_weight=1, classifier__n_estimators=202, classifier__subsample=0.7783331011414365;, score=0.044 total time=  41.8s
[CV 2/3] END classifier__colsample_bytree=0.9329770563201687, classifier__gamma=1.0616955533913808, classifier__learning_rate=0.04636499344142013, classifier__max_depth=3, classifier__min_child_weight=1, classifier__n_estimators=157, classifier__subsample=0.8099025726528951;, score=0.014 total time=  41.9s
[CV 1/3] END classifier__colsample_bytree=0.749816047538945, classifier__gamma=4.75357153204958, classifier__learning_rate=0.15639878836228102, classifier__max_depth=3, classifier__min_child_weight=1, classifier__n_estimators=202, classifier__subsample=0.7783331011414365;, score=0.042 total time=  38.4s
[CV 3/3] END classifier__colsample_bytree=0.7727780074568463, classifier__gamma=1.

In [56]:
print("Best Parameters:\n", random_search.best_params_)
print("Classification Report (Optimal Threshold):")
print(classification_report(y_val, (y_prob >= optimal_threshold).astype(int)))

# Save the trained pipeline and threshold
joblib.dump(best_model, "model/optimized_xgb_fraud_pipeline.pkl")
joblib.dump(optimal_threshold, "model/optimal_threshold.pkl")

Best Parameters:
 {'classifier__colsample_bytree': 0.6298202574719083, 'classifier__gamma': 4.9344346830025865, 'classifier__learning_rate': 0.1644489538593315, 'classifier__max_depth': 10, 'classifier__min_child_weight': 5, 'classifier__n_estimators': 180, 'classifier__subsample': 0.88453678109946}
Classification Report (Optimal Threshold):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    998709
           1       0.93      0.88      0.91      1291

    accuracy                           1.00   1000000
   macro avg       0.97      0.94      0.95   1000000
weighted avg       1.00      1.00      1.00   1000000



['model/optimal_threshold.pkl']

In [91]:
# Final test evaluation
y_test_prob = best_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_prob >= optimal_threshold).astype(int)

print("Test Set Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Set Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1360683
           1       0.95      0.86      0.91      1758

    accuracy                           1.00   1362441
   macro avg       0.98      0.93      0.95   1362441
weighted avg       1.00      1.00      1.00   1362441

