In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Step 1: read the train and test CSVs into DataFrames.
# Paths are relative to the notebook's working directory.
print("Loading the validated datasets...")
train_final = pd.read_csv(
    filepath_or_buffer='engineered_data/fraudTrain_engineered.csv'
)
test_final = pd.read_csv(
    filepath_or_buffer='engineered_data/fraudTest_engineered.csv'
)

Loading the validated datasets...


In [None]:
# Step 2: prepare features and target
# One-hot encode gender using a category list fixed from the training split.
#
# Encoding each frame independently lets get_dummies derive the dummy columns
# (and the level removed by drop_first) from whatever values that frame
# happens to contain, so train and test could end up with different columns
# and the later `test_final[features]` selection would fail or be misaligned.
# Casting both frames to one shared CategoricalDtype makes them emit exactly
# the same dummy columns. Test values never seen in training become NaN under
# the cast and therefore receive all-zero dummies.
gender_dtype = pd.CategoricalDtype(
    categories=sorted(train_final['gender'].dropna().unique())
)
train_final = pd.get_dummies(
    train_final.astype({'gender': gender_dtype}),
    columns=['gender'],
    drop_first=True,
)
test_final = pd.get_dummies(
    test_final.astype({'gender': gender_dtype}),
    columns=['gender'],
    drop_first=True,
)

In [None]:
# Every column except the label is used as a model input.
features = [name for name in train_final.columns if name != 'is_fraud']

# Split each frame into its feature matrix (X) and label vector (y).
X_train, y_train = train_final[features], train_final['is_fraud']
X_test, y_test = test_final[features], test_final['is_fraud']

In [None]:
# Step 3: define LightGBM parameters
# Class counts in the training labels; their ratio (negatives / positives)
# is passed as scale_pos_weight to up-weight the positive class.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())

params = dict(
    objective='binary',                        # binary classification
    metric='binary_logloss',                   # evaluation metric
    scale_pos_weight=n_negative / n_positive,  # handle class imbalance
    random_state=42,
)

In [None]:
# Step 4: wrap the feature matrices and labels in LightGBM Dataset objects.
train_data = lgb.Dataset(data=X_train, label=y_train)
# The test Dataset is constructed with the training Dataset as its reference.
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)

In [None]:
# Step 5: fit a booster on the training Dataset for 100 boosting rounds.
print("Training the LightGBM model...")
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=100,
)

Training the LightGBM model...
[LightGBM] [Info] Number of positive: 7506, number of negative: 1289169
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1196
[LightGBM] [Info] Number of data points in the train set: 1296675, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005789 -> initscore=-5.146050
[LightGBM] [Info] Start training from score -5.146050


In [None]:
# Step 6: score the held-out features with the trained booster.
# The scores are thresholded into 0/1 labels in the next step.
print("Generating predicted probabilities...")
y_pred_prob = lgb_model.predict(data=X_test)

Generating predicted probabilities...


In [None]:
# Step 7: sweep decision thresholds and print a classification report for each.
thresholds = [0.8, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99]
for thresh in thresholds:
    # A row is labelled positive (1) only when its score exceeds the cutoff.
    y_pred = (y_pred_prob > thresh).astype(int)
    report = classification_report(y_test, y_pred)
    print(f"\nThreshold: {thresh}", report, sep="\n")


Threshold: 0.8
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.14      0.90      0.24      2145

    accuracy                           0.98    555719
   macro avg       0.57      0.94      0.62    555719
weighted avg       1.00      0.98      0.99    555719


Threshold: 0.9
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.16      0.88      0.27      2145

    accuracy                           0.98    555719
   macro avg       0.58      0.93      0.63    555719
weighted avg       1.00      0.98      0.99    555719


Threshold: 0.95
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.16      0.87      0.28      2145

    accuracy                           0.98    555719
   macro avg       0.58      0.93      0.63    555719
weighted avg       1.00  