In [None]:
import os
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# Read the Data

In [None]:
# Resolve the dataset location relative to the directory the notebook runs from.
working_directory = os.getcwd()
print(working_directory)

# Change the path to your dataset, if needed.
dataset_csv = f"{working_directory}/Input_Data/creditcard_post_correlation.csv"
data = pd.read_csv(dataset_csv)

## Define Predictors and Target Variables
##### We will specify the predictor features and the target variable. Additionally, categorical features can be identified if present. In this case, there are no categorical features.



In [None]:
# Name of the label column the model learns to predict.
target = 'Fraud_Flag'

# Model inputs: the transaction time, the columns V1..V28, and the
# transaction amount, in that order.
predictors = (
    ['Transaction_Time']
    + [f'V{index}' for index in range(1, 29)]
    + ['Transaction_Amount']
)

## Define the TRAIN/VALIDATION/TEST SPLIT

In [None]:
# --- Train / validation / test split -------------------------------------
VALID_SIZE = 0.20  # fraction passed to train_test_split for the validation split
TEST_SIZE = 0.20   # fraction passed to train_test_split for the test split

# --- Cross-validation ----------------------------------------------------
NUMBER_KFOLDS = 5  # number of KFolds for cross-validation

# Seed passed to every random split in this notebook.
RANDOM_STATE = 2018

# --- LightGBM training controls ------------------------------------------
MAX_ROUNDS = 1000   # upper bound on lgb boosting iterations
EARLY_STOP = 50     # lgb early-stopping patience, in rounds
OPT_ROUNDS = 1000   # to be adjusted based on best validation rounds
VERBOSE_EVAL = 50   # print the metric every VERBOSE_EVAL rounds

# --- Input data location -------------------------------------------------
IS_LOCAL = True  # Set to True when running locally

if IS_LOCAL:
    # Resolve the data folder relative to the working directory (the same
    # location the CSV is read from) instead of a machine-specific absolute
    # path. The trailing separator is kept so `PATH + filename` still works.
    PATH = os.path.join(os.getcwd(), "Input_Data", "")
else:
    PATH = "../input"

# List the files in the specified directory. This is informational only, so a
# missing folder is reported rather than aborting the whole cell.
if os.path.isdir(PATH):
    print(os.listdir(PATH))
else:
    print(f"Input directory not found: {PATH}")

## Split the data into train, validation, and test sets

In [None]:
# Keyword arguments shared by both splits: same seed, rows shuffled.
# NOTE: no `stratify` argument is passed, so neither split is stratified.
split_options = {"random_state": RANDOM_STATE, "shuffle": True}

# Stage 1: hold out TEST_SIZE of all rows as the test set.
train_df, test_df = train_test_split(data, test_size=TEST_SIZE, **split_options)

# Stage 2: carve VALID_SIZE of the remaining rows out as the validation set;
# train_df is rebound to whatever is left for fitting.
train_df, valid_df = train_test_split(train_df, test_size=VALID_SIZE, **split_options)

## LightGBM

#### Continue testing with another gradient boosting algorithm, LightGBM.

### LightGBM definitions

In [None]:
# Settings named for a RandomForestClassifier; none of them is referenced by
# the LightGBM cells visible in this section of the notebook.

# Split-quality metric.
RFC_METRIC = 'gini'
# Number of estimators.
NUM_ESTIMATORS = 100
# Number of parallel jobs.
NUMBER_OF_JOBS = 4

### Define model parameters

In [None]:
# LightGBM training parameters, handed unchanged to lgb.train.
lgbm_parameters = dict(
    # Booster type, objective, and the metric reported on the evaluation sets.
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    learning_rate=0.05,
    # Tree shape: num_leaves should stay below 2^(max_depth); max_depth=-1
    # would mean no limit.
    num_leaves=7,
    max_depth=4,
    min_child_samples=100,  # minimum number of rows in a leaf (min_data_in_leaf)
    max_bin=100,            # number of histogram bins per feature
    # Row subsampling: use 90% of the rows, re-drawn every iteration
    # (a subsample_freq <= 0 would disable it).
    subsample=0.9,
    subsample_freq=1,
    colsample_bytree=0.7,   # share of columns sampled for each tree
    min_child_weight=0,     # minimum sum of hessian required in a leaf
    min_split_gain=0,       # minimum gain required to perform a split
    nthread=8,
    verbose=0,
    scale_pos_weight=150,   # up-weights the positive class; the data is extremely unbalanced
)

### Prepare the model

In [None]:
def _as_lgb_dataset(frame):
    """Wrap the predictor columns and target column of `frame` in an lgb.Dataset."""
    return lgb.Dataset(
        frame[predictors].values,
        label=frame[target].values,
        feature_name=predictors,
    )


# Dataset the booster is fitted on.
dtrain = _as_lgb_dataset(train_df)

# Dataset used for evaluation during training.
dvalid = _as_lgb_dataset(valid_df)

### Train the model

In [None]:
# Filled in by the record_evaluation callback with the per-round metric
# history of each entry in `valid_sets`, keyed by the names in `valid_names`.
eval_result = {}

# Create a callback to record evaluation metrics
record_cb = lgb.record_evaluation(eval_result)

# Train the LightGBM model, logging the metric every VERBOSE_EVAL rounds.
# num_boost_round is passed explicitly: without it lgb.train falls back to its
# default of 100 rounds and the MAX_ROUNDS setting would be silently ignored.
model = lgb.train(
    lgbm_parameters,
    dtrain,
    num_boost_round=MAX_ROUNDS,
    valid_sets=[dtrain, dvalid],
    valid_names=['train', 'eval'],
    callbacks=[
        record_cb,
        lgb.early_stopping(stopping_rounds=EARLY_STOP),
        lgb.log_evaluation(period=VERBOSE_EVAL),
    ],
)

# Print a summary of the best validation AUC.
# best_iteration is 1-based, hence the "- 1" when indexing the recorded history.
# If it is reported as 0, fall back to the last trained round so the index
# stays explicit rather than relying on a negative index.
best_iter = model.best_iteration or model.current_iteration()
best_auc = eval_result['eval']['auc'][best_iter - 1]
print(f"Best iteration: {best_iter}")
print(f"Best validation AUC: {best_auc:.5f}")

##### Best validation score was obtained for round 29, for which AUC ~= 0.95786.

### Feature importance

In [None]:
# Draw the gain-based importance of the top features on a dedicated axes.
fig, ax = plt.subplots(figsize=(10, 6))

importance_plot_options = {
    "height": 0.8,
    "title": "Feature Importance (LightGBM)",
    "color": "forestgreen",
    "importance_type": 'gain',  # 'gain' is often more informative than 'split'
    "max_num_features": 15,     # show only the top 15 features for clarity
    "xlabel": "Importance Score",
}
lgb.plot_importance(model, ax=ax, **importance_plot_options)

# Restyle the axis labels and title; these calls replace the title and
# x-label text that plot_importance just set.
ax.set_xlabel("Importance Score", fontsize=14)
ax.set_ylabel("Feature", fontsize=14)
ax.set_title(
    "Top Feature Importances (LightGBM)",
    fontsize=18,
    fontweight='bold',
    pad=15,
)

# Enlarge the tick labels, then lay out and render the figure.
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()

### Predict test data

In [None]:
# Score the held-out test rows using the same predictor columns as training.
test_features = test_df[predictors]
predictions5 = model.predict(test_features)

### ROC-AUC score (Area under curve)

In [None]:
# ROC-AUC of the model scores against the true test labels; left as the last
# expression so the notebook displays the value.
roc_auc_score(y_true=test_df[target].values, y_score=predictions5)

##### The ROC-AUC score obtained for the test set is 0.947.