In [None]:
import os
import pandas as pd
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# Read the Data

In [None]:
# Resolve the dataset location relative to the directory the notebook runs from.
working_directory = os.getcwd()
print(working_directory)

# Change the path to your dataset, if needed.
input_csv_path = f"{working_directory}/Input_Data/creditcard_post_correlation.csv"
data = pd.read_csv(input_csv_path)

## Define Predictors and Target Variables
##### We will specify the predictor features and the target variable. Additionally, categorical features can be identified if present. In this case, there are no categorical features.



In [None]:
# Name of the column the model is trained to predict.
target = 'Fraud_Flag'

# Model inputs, in order: the transaction time, the 28 columns V1..V28,
# and the transaction amount (30 columns in total).
predictors = (
    ['Transaction_Time']
    + [f'V{index}' for index in range(1, 29)]
    + ['Transaction_Amount']
)

## Define the TRAIN/VALIDATION/TEST SPLIT

In [None]:
# --- Train / validation / test split ---
VALID_SIZE = 0.20  # fraction of the training set held out for validation (train_test_split test_size)
TEST_SIZE = 0.20   # fraction of the full dataset held out for testing (train_test_split test_size)

# --- Cross-validation ---
NUMBER_KFOLDS = 5  # number of KFolds for cross-validation

RANDOM_STATE = 2018  # seed reused by the data splits and the model parameters

# --- Boosting rounds ---
MAX_ROUNDS = 1000   # maximum number of boosting iterations
EARLY_STOP = 50     # early-stopping patience, in boosting rounds
OPT_ROUNDS = 1000   # to be adjusted based on the best validation round
VERBOSE_EVAL = 50   # print the evaluation metric every this many rounds

# --- Input data location ---
IS_LOCAL = True  # Set to True when running locally

if IS_LOCAL:
    # Derive the folder from the current working directory instead of a
    # machine-specific absolute path, so the notebook runs on any checkout.
    # The trailing empty component keeps the trailing path separator.
    PATH = os.path.join(os.getcwd(), "Input_Data", "")
else:
    PATH = "../input"

print(os.listdir(PATH))  # List the files in the specified directory

## Split the data into train, validation, and test sets

In [None]:
# Both splits shuffle the rows and share the same seed.
split_options = {"random_state": RANDOM_STATE, "shuffle": True}

# First hold out the test set from the full dataset ...
train_valid_df, test_df = train_test_split(data, test_size=TEST_SIZE, **split_options)

# ... then carve the validation set out of what remains.
train_df, valid_df = train_test_split(train_valid_df, test_size=VALID_SIZE, **split_options)

## XGBoost

#### XGBoost is a gradient boosting algorithm

### XGBoost Classifier definitions

In [None]:
# Settings labelled for RandomForestClassifier.
# NOTE(review): none of these are referenced by the XGBoost cells visible in
# this notebook - confirm whether they are still needed.
RFC_METRIC = 'gini'   # metric used for RandomForestClassifier
NUM_ESTIMATORS = 100  # number of estimators used for RandomForestClassifier
NUMBER_OF_JOBS = 4    # number of parallel jobs used for RandomForestClassifier

### Prepare the model
##### Initialize the DMatrix objects for the training, validation, and test sets from the split datasets, and set the parameters used to train the model.

In [None]:
# Wrap each split in a DMatrix: predictor columns as features, target column as label.
dtrain = xgb.DMatrix(train_df[predictors], label=train_df[target].values)
dvalid = xgb.DMatrix(valid_df[predictors], label=valid_df[target].values)
dtest = xgb.DMatrix(test_df[predictors], label=test_df[target].values)

# Datasets whose metric is reported during training (train and valid).
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# XGBoost parameters.
# 'verbosity': 0 replaces the deprecated 'silent': True flag, which current
# XGBoost releases no longer recognise and report as an unused parameter.
xgb_params = {
    'objective': 'binary:logistic',
    'eta': 0.039,
    'verbosity': 0,
    'max_depth': 2,
    'subsample': 0.8,
    'colsample_bytree': 0.9,
    'eval_metric': 'auc',
    'random_state': RANDOM_STATE,
}

### Train the model

In [None]:
# Train for at most MAX_ROUNDS rounds, reporting the metric on the watchlist
# every VERBOSE_EVAL rounds and stopping early once it has not improved for
# EARLY_STOP rounds (maximize=True: a higher metric value is better).
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=MAX_ROUNDS,
    evals=watchlist,
    early_stopping_rounds=EARLY_STOP,
    maximize=True,
    verbose_eval=VERBOSE_EVAL,
)

##### The best validation score (ROC-AUC) was 0.98039, for round 100.

### Feature importance

In [None]:
# Draw the gain-based feature importance of the trained model on its own axes.
fig, ax = plt.subplots(figsize=(10, 6))
importance_plot_options = {
    "height": 0.8,
    "title": "Feature Importance (XGBoost)",
    "color": "forestgreen",
    "importance_type": 'gain',
    "show_values": False,  # no numeric labels next to the bars
}
xgb.plot_importance(model, ax=ax, **importance_plot_options)

# Restyle the title and axis labels (this title replaces the one set above).
ax.set_title("XGBoost Feature Importance", fontsize=18, fontweight='bold', pad=15)
ax.set_ylabel("Features", fontsize=14)
ax.set_xlabel("Importance Score", fontsize=14)

# Enlarge the tick labels, then lay out and render the figure.
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)
fig.tight_layout()
plt.show()


### Predict test set

In [None]:
# Score the held-out test set with the trained booster.
# NOTE(review): training uses early stopping - confirm whether predict() in the
# installed XGBoost version uses the best iteration or every trained round.
predictions4 = model.predict(data=dtest)

### Confusion Matrix

In [None]:
# Threshold the predicted scores at 0.5 to obtain hard 0/1 class labels.
xgb_pred_labels = (predictions4 >= 0.5).astype(int)

# Tabulate actual labels against predicted labels on the test set.
cm_xgb = confusion_matrix(test_df[target].values, xgb_pred_labels)

# Render the matrix as an annotated heatmap on an explicit figure/axes pair.
# Assumes class 0 is "Not Fraud" and class 1 is "Fraud" - TODO confirm.
class_names = ['Not Fraud', 'Fraud']
cm_fig, cm_ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    cm_xgb,
    ax=cm_ax,
    annot=True,
    annot_kws={"size": 18, "weight": "bold"},
    fmt='d',
    cmap="Blues",
    cbar=False,
    linewidths=1,
    linecolor="black",
    xticklabels=class_names,
    yticklabels=class_names,
)

# Title and axis labels.
cm_ax.set_title('XGBoost Confusion Matrix', fontsize=18, fontweight='bold', pad=15)
cm_ax.set_xlabel('Predicted', fontsize=14, labelpad=10)
cm_ax.set_ylabel('Actual', fontsize=14, labelpad=10)
cm_fig.tight_layout()
plt.show()

### ROC-AUC score (Area under curve)

In [None]:
# ROC-AUC of the predicted scores against the true test labels; kept as the
# cell's last expression so the notebook displays the value.
roc_auc_score(y_true=test_df[target].values, y_score=predictions4)

##### The AUC score for the prediction of fresh data (test set) is 0.974.