In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder



In [13]:
# Load the dataset from the working directory. Later cells use its 'Class'
# column both as the stratification key and as the prediction target.
data = pd.read_csv('creditcard.csv')

In [14]:
# One stratified 80/20 shuffle split. The fixed random_state pins the
# train/validation partition so results are reproducible after a kernel restart.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)


In [15]:
# The splitter yields exactly one (train, test) pair of positional index
# arrays, so take it directly and slice the frame with it.
train_index, test_index = next(splitter.split(data, data['Class']))
train_data, validation_data = data.iloc[train_index], data.iloc[test_index]

In [16]:
# Separate the predictors from the 'Class' target in each partition.
X_train, y_train = train_data.drop(columns=['Class']), train_data['Class']
X_validation, y_validation = (
    validation_data.drop(columns=['Class']),
    validation_data['Class'],
)

In [17]:
# Group the predictor columns by dtype; each group is later routed to its own
# preprocessing pipeline.
numerical_features = (
    X_train
    .select_dtypes(include=['int64', 'float64'])
    .columns
    .to_list()
)
categorical_features = (
    X_train
    .select_dtypes(include=['category', 'bool', 'object'])
    .columns
    .to_list()
)

In [18]:
# Numerical columns: fill missing values with the column median, then
# standardise each feature.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; handle_unknown='ignore' avoids errors on categories
# that were not seen during fit.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [19]:
# Apply each sub-pipeline to its own column group. Columns listed in neither
# group are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

In [20]:
# Candidate classifiers, listed in the same order as their hyper-parameter
# grids (the two lists are zipped together during model selection).
# Every estimator gets a fixed random_state so scores are comparable between
# runs instead of drifting with each kernel restart.
reg = LogisticRegression(solver='liblinear', random_state=42)
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
# probability=True is required for predict_proba; its internal calibration is
# randomised, hence the seed.
svm = SVC(probability=True, random_state=42)
models = [reg, rf, xgb, svm]

In [21]:
# One hyper-parameter grid per candidate model, in the same order as `models`.
# Keys carry the 'model__' prefix so they address the pipeline step named 'model'.
param_grids = [
    # Logistic regression
    {
        'model__penalty': ['l1', 'l2'],
        'model__C': [0.01, 0.1, 1, 10],
        'model__class_weight': [None, 'balanced'],
    },
    # Random forest
    {
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 5, 10],
        'model__min_samples_split': [2, 5],
        'model__class_weight': [None, 'balanced'],
    },
    # XGBoost
    {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7],
        'model__subsample': [0.8, 1.0],
        'model__scale_pos_weight': [1, 10, 25, 50, 75, 99],
    },
    # Support vector machine
    {
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],
        'model__kernel': ['rbf', 'linear'],
        'model__class_weight': [None, 'balanced'],
    },
]

# Tune every candidate with a grid search, report its metrics on the held-out
# validation set, and keep the tuned pipeline with the highest validation
# ROC AUC in `best_model`.
#
# `best_score` must be initialised here: the comparison at the bottom of the
# loop reads it, and without this line a fresh kernel raises NameError only
# after the first (expensive) grid search has already finished.
best_score = -1  # sentinel below any attainable ROC AUC
best_model = None
for model, param_grid in zip(models, param_grids):
    # Fresh pipeline per candidate: shared preprocessing followed by the model.
    pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ]
    )

    # 2-fold cross-validation, candidates ranked by ROC AUC, all cores used.
    grid_search = GridSearchCV(pipe, param_grid, cv=2, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # With GridSearchCV's default refit=True these calls delegate to the best
    # pipeline, refitted on the whole training partition.
    y_val_pred = grid_search.predict(X_validation)
    y_val_prob = grid_search.predict_proba(X_validation)[:, 1]  # P(Class == 1)

    roc_auc = roc_auc_score(y_validation, y_val_prob)
    precision = precision_score(y_validation, y_val_pred)
    recall = recall_score(y_validation, y_val_pred)
    f1 = f1_score(y_validation, y_val_pred)

    print(f"Model: {model.__class__.__name__}")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("-" * 40)

    # Track the best candidate by validation ROC AUC.
    if roc_auc > best_score:
        best_score = roc_auc
        best_model = grid_search.best_estimator_

KeyboardInterrupt: 