In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix




In [12]:
# Load the full dataset from the CSV that sits next to this notebook.
data = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [13]:
# One stratified 80/20 shuffle split. The fixed random_state pins the
# train/validation partition so the metrics reported further down can be
# reproduced after a kernel restart instead of changing on every run.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)


In [14]:
# `splitter` is configured with n_splits=1, so pull its only pair of
# positional index arrays directly instead of looping over a one-item generator.
train_index, test_index = next(splitter.split(data, data['Class']))

train_data = data.iloc[train_index]
validation_data = data.iloc[test_index]

In [15]:
# Split each partition into the target ('Class') and the predictors
# (every remaining column).
y_train = train_data['Class']
X_train = train_data.drop(columns='Class')

y_validation = validation_data['Class']
X_validation = validation_data.drop(columns='Class')

In [16]:
# Group the predictor columns by dtype so each group can get its own
# preprocessing pipeline.
# 'number' matches every numeric dtype (int32, float32, ... as well as
# int64/float64). The previous hard-coded ['int64', 'float64'] list would leave
# any other numeric column out of both groups, so it would never reach the
# preprocessing step that is built from these two lists.
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(include=['category', 'bool', 'object']).columns.tolist()

In [17]:
# Numeric columns: fill gaps with the column median, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' is set so categories unseen during fit do not
# raise at transform time.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [18]:
# Send each column group through its matching pipeline: the numeric pipeline
# for `numerical_features`, the categorical one for `categorical_features`.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

In [19]:
# Candidate classifiers compared in the evaluation loop.
# Seeds are fixed on the estimators that accept randomness so repeated runs
# give the same scores.
reg = LogisticRegression()
xgb = XGBClassifier(random_state=42)
# probability=True is needed because the evaluation loop calls predict_proba;
# random_state seeds the shuffling SVC performs when fitting those probability
# estimates.
svm = SVC(probability=True, random_state=42)
models = [reg, xgb, svm]

In [20]:
# Fit every candidate behind the shared preprocessing step, print its
# validation metrics, and keep the fitted pipeline with the highest ROC AUC.
best_roc_auc = -1
best_model = None

for model in models:
    # NOTE(review): every pipeline references the same `preprocessor` object,
    # which is fitted again on the same X_train in each iteration.
    pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ]
    )

    pipe.fit(X_train, y_train)

    # Hard labels feed precision/recall/F1 and the confusion matrix;
    # positive-class probabilities (column 1) feed ROC AUC.
    y_val_pred = pipe.predict(X_validation)
    y_val_prob = pipe.predict_proba(X_validation)[:, 1]

    roc_auc = roc_auc_score(y_validation, y_val_prob)
    precision = precision_score(y_validation, y_val_pred)
    recall = recall_score(y_validation, y_val_pred)
    f1 = f1_score(y_validation, y_val_pred)
    conf_matrix = confusion_matrix(y_validation, y_val_pred)

    print(f"Model: {model.__class__.__name__}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("-" * 40)

    # These trackers used to be initialised and then never touched, so no
    # selection took place. Record the best-scoring fitted pipeline; a strict
    # '>' keeps the earlier model on ties.
    if roc_auc > best_roc_auc:
        best_roc_auc = roc_auc
        best_model = pipe

Model: LogisticRegression
ROC AUC Score: 0.9738
Precision: 0.8611
Recall: 0.6327
F1 Score: 0.7294
Confusion Matrix:
[[56854    10]
 [   36    62]]
----------------------------------------
Model: XGBClassifier
ROC AUC Score: 0.9877
Precision: 0.9048
Recall: 0.7755
F1 Score: 0.8352
Confusion Matrix:
[[56856     8]
 [   22    76]]
----------------------------------------
Model: SVC
ROC AUC Score: 0.9440
Precision: 0.9296
Recall: 0.6735
F1 Score: 0.7811
Confusion Matrix:
[[56859     5]
 [   32    66]]
----------------------------------------
