Importing Libraries

In [113]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score,f1_score,confusion_matrix

Load Dataset

In [114]:
crd_data = pd.read_csv("creditcard.csv")
crd_df = pd.DataFrame(crd_data)
print(crd_df.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

Sanity Check

In [115]:
print("Describe:",crd_df.describe())
print("info:",crd_df.info())

Describe:                 Time            V1            V2            V3            V4  \
count  284807.000000  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean    94813.859575  1.168375e-15  3.416908e-16 -1.379537e-15  2.074095e-15   
std     47488.145955  1.958696e+00  1.651309e+00  1.516255e+00  1.415869e+00   
min         0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00   
25%     54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01   
50%     84692.000000  1.810880e-02  6.548556e-02  1.798463e-01 -1.984653e-02   
75%    139320.500000  1.315642e+00  8.037239e-01  1.027196e+00  7.433413e-01   
max    172792.000000  2.454930e+00  2.205773e+01  9.382558e+00  1.687534e+01   

                 V5            V6            V7            V8            V9  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   9.604066e-16  1.487313e-15 -5.556467e-16  1.213481e-16 -2.406331e-15   
std    1.380247e+00  1.332271e+0

In [116]:
print(crd_df.isnull().sum())

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [117]:
print(crd_df.nunique())

Time      124592
V1        275663
V2        275663
V3        275663
V4        275663
V5        275663
V6        275663
V7        275663
V8        275663
V9        275663
V10       275663
V11       275663
V12       275663
V13       275663
V14       275663
V15       275663
V16       275663
V17       275663
V18       275663
V19       275663
V20       275663
V21       275663
V22       275663
V23       275663
V24       275663
V25       275663
V26       275663
V27       275663
V28       275663
Amount     32767
Class          2
dtype: int64


In [118]:
crd_x = crd_df.drop(columns=["Class"])
crd_y = crd_df["Class"]
print(crd_x.head())
print(crd_y.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V20       V21       V22       V23       V24  \
0  0.098698  0.363787  ...  0.251412 -0.018307  0.277838 -0.110474  0.066928   
1  0.085102 -0.255425  ... -0.069083 -0.225775 -0.638672  0.101288 -0.339846   
2  0.247676 -1.514654  ...  0.524980  0.247998  0.771679  0.909412 -0.689281   
3  0.377436 -1.387024  ... -0.208038 -0.108300  0.005274 -0.190321 -1.175575   
4 -0.270533  0.817739  ...  0.408542 -0.009431  0.798278 -0.137458  0.141267   

        V25       V26       V27 

Feature Selection

In [119]:
select_k_best = SelectKBest(score_func=f_classif,k=15)
k_feature = select_k_best.fit(crd_x,crd_y)
selected_features = pd.DataFrame({
   'Feature': crd_x.columns,
   'Score': k_feature.scores_
}).sort_values(by='Score', ascending=False)
print(selected_features)

   Feature         Score
17     V17  33979.168593
14     V14  28695.547788
12     V12  20749.822361
10     V10  14057.979985
16     V16  11443.349428
3       V3  11014.508305
7       V7  10349.605408
11     V11   6999.355047
4       V4   5163.832114
18     V18   3584.380605
1       V1   2955.668946
9       V9   2746.600273
5       V5   2592.357929
2       V2   2393.401678
6       V6    543.510578
21     V21    465.916251
19     V19    344.990997
20     V20    114.999731
8       V8    112.548287
27     V27     88.045296
0     Time     43.252998
28     V28     25.901405
24     V24     14.850932
29  Amount      9.033345
13     V13      5.947672
26     V26      5.653653
15     V15      5.080193
25     V25      3.116062
23     V23      2.053476
22     V22      0.184706


In [120]:
crd_x_selected = crd_x[["V17","V14","V12","V10","V16","V3","V7","V11","V4","V18","V1","V9","V5","V2"]]
print(crd_x_selected)

             V17       V14       V12       V10       V16        V3        V7  \
0       0.207971 -0.311169 -0.617801  0.090794 -0.470401  2.536347  0.239599   
1      -0.114805 -0.143772  1.065235 -0.166974  0.463917  0.166480 -0.078803   
2       1.109969 -0.165946  0.066084  0.207643 -2.890083  1.773209  0.791461   
3      -0.684093 -0.287924  0.178228 -0.054952 -1.059647  1.792993  0.237609   
4      -0.237033 -1.119670  0.538196  0.753074 -0.451449  1.548718  0.592941   
...          ...       ...       ...       ...       ...       ...       ...   
284802  1.991691  4.626942  2.711941  4.356170  1.107641 -9.834783 -4.918215   
284803 -0.025693 -0.675143  0.915802 -0.975926 -0.711757  2.035030  0.024330   
284804  0.313502 -0.510602  0.063119 -0.484782  0.140716 -3.249640 -0.296827   
284805  0.509928  0.449624 -0.962886 -0.399126 -0.608577  0.702510 -0.686180   
284806 -0.660377 -0.084316 -0.031513 -0.915427 -0.302620  0.703337  1.577006   

             V11        V4       V18   

Train Test Split

In [121]:
crd_x_train, crd_x_test, crd_y_train, crd_y_test = train_test_split(crd_x_selected,crd_y,test_size=0.25,random_state=42)

Resampling

In [122]:
print(crd_y_train.value_counts())

Class
0    213226
1       379
Name: count, dtype: int64


In [123]:
crd_smote = SMOTE(sampling_strategy="minority")
crd_x_train_resample , crd_y_train_resample = crd_smote.fit_resample(crd_x_train, crd_y_train)

Model Training

In [124]:
crd_lr = LogisticRegression()
crd_lr.fit(crd_x_train_resample,crd_y_train_resample)
crd_pred_lr = crd_lr.predict(crd_x_test)

In [125]:
print("lr Accuracy:",accuracy_score(crd_y_test,crd_pred_lr))
print("lr Precision:",precision_score(crd_y_test,crd_pred_lr))
print("lr Recall:",recall_score(crd_y_test,crd_pred_lr))
print("lr F1 Score:",f1_score(crd_y_test,crd_pred_lr))
print("lr Confusion Matrix:\n",confusion_matrix(crd_y_test,crd_pred_lr))
print("lr Classification Report:\n",classification_report(crd_y_test,crd_pred_lr))


lr Accuracy: 0.9721496587174517
lr Precision: 0.049614643545279384
lr Recall: 0.911504424778761
lr F1 Score: 0.09410689812699863
lr Confusion Matrix:
 [[69116  1973]
 [   10   103]]
lr Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99     71089
           1       0.05      0.91      0.09       113

    accuracy                           0.97     71202
   macro avg       0.52      0.94      0.54     71202
weighted avg       1.00      0.97      0.98     71202



In [None]:
crd_dt = DecisionTreeClassifier()
crd_dt.fit(crd_x_train_resample,crd_y_train_resample)
crd_pred_dt = crd_dt.predict(crd_x_test)

In [None]:
print("dt Accuracy:",accuracy_score(crd_y_test,crd_pred_dt))
print("dt Precision:",precision_score(crd_y_test,crd_pred_dt))
print("dt Recall:",recall_score(crd_y_test,crd_pred_dt))
print("dt F1 Score:",f1_score(crd_y_test,crd_pred_dt))
print("dt Confusion Matrix:\n",confusion_matrix(crd_y_test,crd_pred_dt))
print("dt Classification Report:\n",classification_report(crd_y_test,crd_pred_dt))

In [None]:
crd_rf = RandomForestClassifier(n_estimators=100,max_depth=5,min_samples_split=2,random_state=42,class_weight="balanced")
crd_rf.fit(crd_x_train_resample,crd_y_train_resample)
crd_pred_rf = crd_rf.predict(crd_x_test)

In [None]:
print("rf Accuracy:",accuracy_score(crd_y_test,crd_pred_rf))
print("rf Precision:",precision_score(crd_y_test,crd_pred_rf))
print("rf Recall:",recall_score(crd_y_test,crd_pred_rf))
print("rf F1 Score:",f1_score(crd_y_test,crd_pred_rf))
print("rf Confusion Matrix:\n",confusion_matrix(crd_y_test,crd_pred_rf))
print("rf Classification Report:\n",classification_report(crd_y_test,crd_pred_rf))

In [130]:
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [200, 300, 500],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 1, 5]
}

In [None]:
xgb_tuned = XGBClassifier(
    scale_pos_weight=50,
    eval_metric='logloss',
    max_depth =5,learning_rate=0.1,n_estimators=300,subsample=0.8,colsample_bytree=1.0,gamma=1,
    random_state=42
)

grid = RandomizedSearchCV(
    estimator=xgb_tuned,
    param_distributions=params,
    scoring='f1',
    cv=3,
    verbose=2,
    n_jobs=-1
)

xgb_tuned.fit(crd_x_train_resample, crd_y_train_resample)

# print("Best Parameters:", grid.best_params_)
# best_xgb = grid.best_estimator_

final_pred = xgb_tuned.predict(crd_x_test)

print("Final Accuracy:", accuracy_score(crd_y_test, final_pred))
print("Final Precision:", precision_score(crd_y_test, final_pred))
print("Final Recall:", recall_score(crd_y_test, final_pred))
print("Final F1:", f1_score(crd_y_test, final_pred))
print("Final Confusion Matrix:\n", confusion_matrix(crd_y_test, final_pred))
print("Final Classification Report:\n", classification_report(crd_y_test, final_pred))

Final Accuracy: 0.9920788741889273
Final Precision: 0.1546707503828484
Final Recall: 0.8938053097345132
Final F1: 0.26370757180156656
Final Confusion Matrix:
 [[70537   552]
 [   12   101]]
Final Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     71089
           1       0.15      0.89      0.26       113

    accuracy                           0.99     71202
   macro avg       0.58      0.94      0.63     71202
weighted avg       1.00      0.99      0.99     71202



In [139]:
y_proba = xgb_tuned.predict_proba(crd_x_test)[:, 1]

threshold = 0.50   # try 0.20, 0.25, 0.30
pred_thresh = (y_proba >= threshold).astype(int)

print("Tuned Precision:", precision_score(crd_y_test, pred_thresh))
print("Tuned Recall:", recall_score(crd_y_test, pred_thresh))
print("Tuned F1:", f1_score(crd_y_test, pred_thresh))
print(confusion_matrix(crd_y_test, pred_thresh))

Tuned Precision: 0.1546707503828484
Tuned Recall: 0.8938053097345132
Tuned F1: 0.26370757180156656
[[70537   552]
 [   12   101]]


In [144]:
import pickle

with open("xgb_fraud_model.pkl", "wb") as f:
    pickle.dump(xgb_tuned, f)

print("✅ Model saved successfully as xgb_fraud_model.pkl")


✅ Model saved successfully as xgb_fraud_model.pkl
