In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import xgboost as xgb

In [37]:
df = pd.read_csv('creditcard.csv')

In [38]:
X = df.drop('Class', axis=1) 
y = df['Class']               

In [39]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [40]:
# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [41]:
# Define the XGBoost model
model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    tree_method='gpu_hist',  
    gpu_id=0,  
    random_state=42
)

In [42]:
# Evaluate on the original dataset
model.fit(X_train, y_train)
y_pred_original = model.predict(X_test)
print("Original Dataset (Imbalanced):")
print(classification_report(y_test, y_pred_original))
print("Confusion Matrix (Original Dataset):")
print(confusion_matrix(y_test, y_pred_original))


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Original Dataset (Imbalanced):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.94      0.82      0.88       136

    accuracy                           1.00     85443
   macro avg       0.97      0.91      0.94     85443
weighted avg       1.00      1.00      1.00     85443

Confusion Matrix (Original Dataset):
[[85300     7]
 [   24   112]]


In [43]:
# Define sampling methods
oversample = SMOTE(random_state=42)
undersample = RandomUnderSampler(random_state=42)

# Define pipelines for oversampling and undersampling
pipeline_oversample = Pipeline([
    ('o', oversample),
    ('m', model)
])

pipeline_undersample = Pipeline([
    ('u', undersample),
    ('m', model)
])

In [44]:
# Fit and evaluate oversampled dataset
pipeline_oversample.fit(X_train, y_train)
y_pred_oversample = pipeline_oversample.predict(X_test)
print("Oversampled Dataset:")
print(classification_report(y_test, y_pred_oversample))
print("Confusion Matrix (Oversampled Dataset):")
print(confusion_matrix(y_test, y_pred_oversample))


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Oversampled Dataset:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.73      0.87      0.79       136

    accuracy                           1.00     85443
   macro avg       0.86      0.93      0.90     85443
weighted avg       1.00      1.00      1.00     85443

Confusion Matrix (Oversampled Dataset):
[[85263    44]
 [   18   118]]


In [45]:
# Fit and evaluate undersampled dataset
pipeline_undersample.fit(X_train, y_train)
y_pred_undersample = pipeline_undersample.predict(X_test)
print("Undersampled Dataset:")
print(classification_report(y_test, y_pred_undersample))
print("Confusion Matrix (Undersampled Dataset):")
print(confusion_matrix(y_test, y_pred_undersample))


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.



Undersampled Dataset:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     85307
           1       0.04      0.93      0.08       136

    accuracy                           0.96     85443
   macro avg       0.52      0.95      0.53     85443
weighted avg       1.00      0.96      0.98     85443

Confusion Matrix (Undersampled Dataset):
[[82315  2992]
 [    9   127]]



    E.g. tree_method = "hist", device = "cuda"



### Analysis of Model Performance on Different Datasets

#### Oversampling
- **Improves Recall:** Oversampling helps improve recall for the minority class (`1`), allowing the model to identify more instances of this class.
- **Balance Between Precision and Recall:** Achieves a better balance between precision and recall compared to the original dataset.
- **Potential Drawback:** May introduce noise and lower precision, which means that while the model becomes better at detecting minority class instances, the number of false positives may increase.

#### Undersampling
- **High Recall for Minority Class:** Results in a model with very high recall for the minority class (`1`), indicating that the model identifies most of the actual minority class instances.
- **Low Precision:** Suffers from extremely low precision, which reduces the model's reliability. In practical scenarios, low precision means that when the model predicts the minority class, it is often incorrect, making it less useful for applications where accurate classification of the minority class is crucial.

#### Original Dataset
- **High Precision and Good Recall:** Achieves high precision and good recall for the minority class (`1`), meaning that the model performs well in correctly identifying instances of the minority class while maintaining a good balance with the majority class.
- **Class Imbalance Issue:** Does not address the underlying class imbalance issue. This might still be problematic in applications where detecting the minority class is critical, as the model might still be biased towards the majority class.
