In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostClassifier

# Read the Data

In [None]:
# The dataset is looked up relative to the directory the notebook is started from.
working_directory = os.getcwd()
print(working_directory)

# Change the path to your dataset, if needed
input_csv_path = f"{working_directory}/Input_Data/creditcard_post_correlation.csv"
data = pd.read_csv(input_csv_path)

## Define Predictors and Target Variables
##### We will specify the predictor features and the target variable. Additionally, categorical features can be identified if present. In this case, there are no categorical features.



In [None]:
# Column the model is trained to predict
target = 'Fraud_Flag'

# Model inputs: the transaction time, the numbered columns V1..V28, and the
# transaction amount, in that order.
predictors = (
    ['Transaction_Time']
    + [f'V{component}' for component in range(1, 29)]
    + ['Transaction_Amount']
)

## Define the TRAIN/VALIDATION/TEST SPLIT

In [None]:
# --- Hold-out split proportions (both are passed to train_test_split) ---
TEST_SIZE = 0.20   # fraction of the full dataset held out as the test set
VALID_SIZE = 0.20  # fraction of the remaining training data held out for validation

# --- Reproducibility ---
RANDOM_STATE = 2018  # seed shared by the splits and the classifier

# --- Cross-validation ---
NUMBER_KFOLDS = 5  # number of KFolds for cross-validation

# --- Boosting-round settings (annotated as "lgb" settings in the original notes) ---
MAX_ROUNDS = 1000   # maximum number of iterations
EARLY_STOP = 50     # early-stop setting
OPT_ROUNDS = 1000   # to be adjusted based on the best validation rounds
VERBOSE_EVAL = 50   # metric print-out setting

# Set the path to the input data
IS_LOCAL = True  # Set to True since we are running locally

if IS_LOCAL:
    # Resolve the data folder from the current working directory (the same base
    # the data-loading cell uses) rather than a user-specific absolute path, so
    # the notebook runs on any machine. The trailing slash is kept so that
    # PATH + "<file name>" still yields a valid path.
    PATH = f"{os.getcwd()}/Input_Data/"
else:
    PATH = "../input"

# List the files in the selected directory as a quick sanity check;
# this raises if the directory does not exist.
print(os.listdir(PATH))

## Split the data into train, validation and test sets

In [None]:
# Step 1: hold out the test set from the full dataset.
# Both splits are stratified on the target so that every subset keeps the same
# class proportions as the data it was drawn from. Without this, a rare class
# (presumably the fraud label here) can end up over- or under-represented in
# the validation/test sets purely by chance.
train_valid_df, test_df = train_test_split(
    data,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=data[target]
)

# Step 2: carve the validation set out of the remaining rows. A separate name is
# used for the intermediate frame so train_df always means the final training set.
train_df, valid_df = train_test_split(
    train_valid_df,
    test_size=VALID_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=train_valid_df[target]
)

## AdaBoost Classifier (Adaptive Boosting Classifier)

### AdaBoost Classifier definitions

In [None]:
# Number of estimators; this is the value handed to AdaBoostClassifier below.
NUM_ESTIMATORS = 100

# The next two settings are not referenced by the AdaBoost cells visible in this
# notebook; the original notes describe them as RandomForestClassifier settings.
RFC_METRIC = 'gini'  # metric (split criterion)
NUMBER_OF_JOBS = 4   # number of parallel jobs

### Create the model

##### Set the parameters for the model and initialize it.

In [None]:
# Hyper-parameters for the AdaBoost ensemble, collected in one place so they are
# easy to read and tweak before the estimator is built.
adaboost_params = {
    'n_estimators': NUM_ESTIMATORS,
    'learning_rate': 0.8,
    'algorithm': 'SAMME',
    'random_state': RANDOM_STATE,
}

clf2 = AdaBoostClassifier(**adaboost_params)

### Fitting the model

In [None]:
# Fit on the training split only; the validation and test splits stay unseen.
X_train = train_df[predictors]
y_train = train_df[target].values
clf2.fit(X_train, y_train)

### Predict the target values

In [None]:
# Class predictions for the validation split.
X_valid = valid_df[predictors]
predictions2 = clf2.predict(X_valid)

### Feature importance

In [None]:
# Pair every predictor with the importance reported by the fitted model and
# order the rows from least to most important.
feature_importance_df2 = (
    pd.DataFrame({
        'Feature': predictors,
        'Feature importance': clf2.feature_importances_,
    })
    .sort_values(by='Feature importance', ascending=True)
)

# Horizontal bar chart: one bar per feature
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('Feature Importance (AdaBoost)', fontsize=18, fontweight='bold', pad=15)
s = sns.barplot(
    data=feature_importance_df2,
    x='Feature importance',
    y='Feature',
    hue='Feature',
    palette='crest',
    legend=False,
    ax=ax
)

# Write each importance value just to the right of its bar. The loop index is
# used as the bar's vertical position, i.e. bars are assumed to be drawn in row order.
for bar_position, importance in enumerate(feature_importance_df2['Feature importance']):
    s.text(importance + 0.001, bar_position, f"{importance:.3f}", color='black', va='center', fontsize=10)

# Axis labels and tick styling
ax.set_xlabel('Importance', fontsize=14, labelpad=10)
ax.set_ylabel('Features', fontsize=14, labelpad=10)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
fig.tight_layout()
plt.show()

### Confusion Matrix

In [None]:
# Cross-tabulate the actual labels against the predicted labels (validation split).
cm2 = pd.crosstab(valid_df[target].values, predictions2, rownames=['Actual'], colnames=['Predicted'])

# crosstab only creates rows/columns for labels that actually occur. If the model
# never predicts one of the classes, the table would lose a column and no longer
# line up with the two tick labels below, so pad any missing label with zeros.
class_labels = sorted(set(valid_df[target].unique()) | set(pd.unique(predictions2)))
cm2 = cm2.reindex(index=class_labels, columns=class_labels, fill_value=0)

# Draw the matrix as an annotated heatmap.
# NOTE: the tick labels assume the sorted class labels mean (not fraud, fraud).
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    cm2,
    annot=True,
    fmt='d',
    cmap="Blues",
    linewidths=1,
    linecolor="black",
    cbar=False,
    xticklabels=['Not Fraud', 'Fraud'],
    yticklabels=['Not Fraud', 'Fraud'],
    annot_kws={"size": 18, "weight": "bold"},
    ax=ax
)

# Add title and labels
ax.set_xlabel('Predicted', fontsize=16, labelpad=15)
ax.set_ylabel('Actual', fontsize=16, labelpad=15)
ax.set_title('Confusion Matrix', fontsize=18, fontweight='bold', pad=20)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14, rotation=0)
plt.tight_layout()
plt.show()

### ROC-AUC score (Area under curve)

In [None]:
# ROC-AUC measures how well the model ranks positives above negatives, so it must
# be fed continuous scores. Hard 0/1 predictions collapse the ROC curve to a
# single operating point and misstate the score. Column 1 of predict_proba is the
# probability of the larger class label, which roc_auc_score treats as positive.
valid_fraud_proba = clf2.predict_proba(valid_df[predictors])[:, 1]
roc_auc_score(valid_df[target].values, valid_fraud_proba)

##### The ROC-AUC score obtained with AdaBoostClassifier is 0.81.