In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix




In [21]:
data = pd.read_csv('creditcard.csv')

In [22]:
splitter = StratifiedShuffleSplit(n_splits=1,test_size=0.2)


In [23]:
for train_index, test_index in splitter.split(data,data['Class']):
    train_data = data.iloc[train_index]
    validation_data = data.iloc[test_index]

In [24]:
X_train = train_data.drop(['Class'],axis=1)
y_train = train_data['Class']

X_validation = validation_data.drop(['Class'],axis=1)
y_validation = validation_data['Class']

In [25]:
numerical_features = X_train.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['category','bool','object']).columns.tolist()

In [26]:
numerical_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('onehot',OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [27]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',numerical_pipeline,numerical_features),
        ('cat',categorical_pipeline,categorical_features)
    ]
)

In [28]:
reg = LogisticRegression()
xgb = XGBClassifier()
svm = SVC()
models = [reg,xgb,svm]

In [None]:
best_roc_auc = -1
best_model = None

for model in models:
    pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ]
    )
    
    # Fit the pipeline on the training data
    pipe.fit(X_train, y_train)
    
    # Make predictions on the validation data
    y_val_pred = pipe.predict(X_validation)
    y_val_prob = pipe.predict_proba(X_validation)[:, 1]
    
    # Calculate performance metrics
    roc_auc = roc_auc_score(y_validation, y_val_prob)
    precision = precision_score(y_validation, y_val_pred)
    recall = recall_score(y_validation, y_val_pred)
    f1 = f1_score(y_validation, y_val_pred)
    conf_matrix = confusion_matrix(y_validation, y_val_pred)
    
    # Print the model's performance
    print(f"Model: {model.__class__.__name__}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("-" * 40)

Model: LogisticRegression
ROC AUC Score: 0.9666
Precision: 0.8933
Recall: 0.6837
F1 Score: 0.7746
Confusion Matrix:
[[56856     8]
 [   31    67]]
----------------------------------------
Model: XGBClassifier
ROC AUC Score: 0.9705
Precision: 0.9512
Recall: 0.7959
F1 Score: 0.8667
Confusion Matrix:
[[56860     4]
 [   20    78]]
----------------------------------------
