In [1]:
from IPython.display import HTML
HTML('''<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js "></script><script>
code_show=true; 
function code_toggle() {
if (code_show){
$('div.jp-CodeCell > div.jp-Cell-inputWrapper').hide();
} else {
$('div.jp-CodeCell > div.jp-Cell-inputWrapper').show();
}
code_show = !code_show
} 
$( document ).ready(code_toggle);</script><form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>
''')

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold
)

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    average_precision_score,
    make_scorer
)

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import TomekLinks, RandomUnderSampler

from model import FraudDetector

# Notebook autoreload
%load_ext autoreload
%autoreload 2

# For reproducibility
RANDOM_STATE = 39

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [100]:
data = pd.read_csv("historical.csv")
target = data["outcome"]

<div style="
    background: url('https://imgur.com/gtF7pEr.png') no-repeat center center; 
    background-size: cover;
    height: 1000px;
    display: flex;
    flex-direction: column;
    justify-content: center;
    align-items: center;
    text-align: center;
    color: white;
    padding: 20px;
">
    <h1 style="font-size: 50px; font-weight: bold; margin: 10px;"></h1>
    <h2 style="font-size: 30px; font-style: italic; margin: 10px;"></h2>
</div>

<div style="background-color: #215880; padding: 10px; border-radius: 5px;">
    <h3 style="color: #e6f1fa; font-size: 30px; font-weight: bold; margin: 10px;">ABSTRACT</h3>
</div>

<div style="font-family: 'Poppins', sans-serif; font-size: 15px; border-bottom: 2px solid #2a475e; padding-bottom: 15px">
<p style="line-height: 1.5; text-align: justify">
aaaaaaaaa
</p>
</div>

<div style="border-bottom: 1px solid #b6c6de; box-shadow: 0px 1px 0px #66c0f4; padding-bottom: 7px;">
</div>

<div style="background-color: #215880; padding: 10px; border-radius: 5px;">
    <h3 style="color: #e6f1fa; font-size: 30px; font-weight: bold; margin: 10px;">INTRODUCTION</h3>
</div>

<div style="background-color: #215880; padding: 10px; border-radius: 5px;">
    <h3 style="color: #e6f1fa; font-size: 30px; font-weight: bold; margin: 10px;">METHODOLOGY</h3>
</div>

<div style="text-align: center;">
    <img src="https://imgur.com/5MtrMTE.png" alt="Cover" width="200">
</div>

<div style="font-family: 'Poppins', sans-serif; font-size: 15px; padding-bottom: 25px">

  <p style="line-height: 1.5; text-align: justify">
    <b>Case Facts</b><br>
    This lab follows a methodology framework with the objective to develop a proof-of-concept model for credit card fraud detection to address the shortcomings of the existing rule-based system. The team is tasked to directly address the core technical challenge with the data being severely imbalanced. Following Mara's approach, the team will prioritize a high detection rate (recall) and fraud capture rate (FCR).
  </p>

  <p style="line-height: 1.5; text-align: justify">
    <b>Exploratory Data Analysis</b><br>
    The team is already provided with the data used by CreditByte. When opening the provided csv file, the dataset contains: transaction id, V1-V28, Amount, and Outcome. V1-V28 are principal components from a PCA operation done to mask PIIs. The 'Outcome' column indicates whether the transaction is fraudulent (1) or not (0). The team will analyze the distributions of these features to identify initial patterns that may lead to additional insights.
  </p>

  <p style="line-height: 1.5; text-align: justify">
    <b>Data Preprocessing</b><br>
    In this phase, the team has selected and tested several different sampling methods to preprocess the data and address the severe class imbalance. The sampling methods that will be focused on are: SMOTE, SMOTE + Tomek, ADASYN, ADASYN + Tomek, Random Undersampling, Random Oversampling and Class Weights. These seven sampling methods are tested through various metrics and are tuned in order to maximize effectiveness.
  </p>

  <p style="line-height: 1.5; text-align: justify">
    <b>Modeling & Tuning</b><br>
    The Modeling & Tuning phase of the project uses Random Forest classifier with hyperparameter tuning and applying GridSearch on a small parameter grid. This allows the team to focus more on the sampling methods. Through using GridSearch with the various parameters of these sampling methods, the team is able to find the optimal set of parameters that will satisfy the objectives of this project. The objective remains to maximize recall and FCR.
  </p>

  <p style="line-height: 1.5; text-align: justify">
    <b>Insights</b><br>
    The main metrics these sampling methods will be tested on are the recall and the FCR achieved by the different sampling methods. Accompanying these are additional metrics which are: Precision, F1, and Net Savings. These performance metrics will help justify the recommended sampling strategy. A mix of business and technical metrics will help solidify our choice of sampling method.
  </p>

</div>

<div style="padding-bottom: 7px;"></div>


<div style="background-color: #215880; padding: 10px; border-radius: 5px;">
    <h3 style="color: #e6f1fa; font-size: 30px; font-weight: bold; margin: 10px;">EXPLORATORY DATA ANALYSIS</h3>
</div>

In [33]:
value_counts = data["outcome"].value_counts()
total = len(data)
'''
plt.figure(figsize=(8,6))

ax = sns.barplot(
    x=value_counts.index.map({0:"No Fraud", 1:"Fraud"}), 
    y=value_counts.values, 
    hue=value_counts.index.map({0:"No Fraud", 1:"Fraud"}),  # assign hue
    palette="pastel",
)
'''

# Add percentage labels on top of bars
for i, count in enumerate(value_counts.values):
    percent = 100 * count / total
    ax.text(
        i, count + 1000, f"{percent:.2f}%", 
        ha='center', va='bottom', fontsize=12, fontweight='bold'
    )


#plt.title("Value Counts of Outcome")
#plt.ylabel("Count")
#plt.xlabel("Outcome")
#plt.show()

<div style="text-align: center;">
    <img src="https://imgur.com/jPCJSxy.png" alt="Boxplots of Numerical Features" width="700">
</div>

<div style="text-align: center;">
<p style="text-align: center; font-size: 14px; margin-bottom: 30px; margin-left: 100px; font-style: italic;">
    Figure 1. Count of fraudulent and non-fraudulent cases in data set
</p>
</div>

<div style="font-family: 'Poppins', sans-serif; font-size: 15px; padding-bottom: 25px">
<p style="line-height: 1.5; text-align: justify">
Figure 1 shows the count of the number of fraudulent and non-fraudulent cases within the data set. According to the bar graph only 0.2% of the data are fradulent cases. Therefore, telling us that there is a high class imbalance. An imbalance data set leads to misleading accuracies,  biased learning, and poor generalization. In other words, it makes it harder for models to identify the patterns of the minority class. Moving forward, the team will explore different methods of countering this imbalance.
</p>                                                                                                                                                           
</div>

<div style="padding-bottom: 7px;">
</div>

In [None]:
corr_all = data.drop(columns=["tid"]).corr()

# Split by outcome
corr_outcome1 = data[data["outcome"] == 1].drop(columns=["tid"]).corr()
corr_outcome0 = data[data["outcome"] == 0].drop(columns=["tid"]).corr()

# Plot
#fig, axes = plt.subplots(1, 3, figsize=(24, 8))

#sns.heatmap(corr_all, annot=False, cmap="coolwarm", linewidths=0.5, ax=axes[0])
axes[0].set_title("Correlation Heatmap (All Data)", fontsize=14)

#sns.heatmap(corr_outcome1, annot=False, cmap="coolwarm", linewidths=0.5, ax=axes[1])
axes[1].set_title("Correlation Heatmap (Outcome = 1)", fontsize=14)

#sns.heatmap(corr_outcome0, annot=False, cmap="coolwarm", linewidths=0.5, ax=axes[2])
axes[2].set_title("Correlation Heatmap (Outcome = 0)", fontsize=14)

#plt.tight_layout()
#plt.show()

<div style="text-align: center;">
    <img src="https://imgur.com/KvRIeQw.png" alt="Boxplots of Numerical Features" width="1600">
</div>

<div style="text-align: center;">
<p style="text-align: center; font-size: 14px; margin-bottom: 30px; margin-left: 20px; font-style: italic;">
    Figures 2-4. Correlation Heatmaps of PCA Components
</p>
</div>

<div style="font-family: 'Poppins', sans-serif; font-size: 15px; padding-bottom: 25px">
<p style="line-height: 1.5; text-align: justify">
The first correlation Map shows the correlation of all features with one another, while the others show the correlation of each features for fraudulent cases and non-fraudulent cases individually. Looking at the first correlation heatmap, the various PCAs seem to have little to no correlation to one another. However, when isolating the fraud data points, it reveals the multiple PCAs have strong correlations to one another. Therefore, indicating that the iambalance data makes it difficult to identify patterns within the minority class.
</p>                                                                                                                                                           
</div>

<div style="padding-bottom: 7px;">
</div>

In [None]:
subset = data[[f"V{i}" for i in range(1, 4)] + ["outcome"]]

# Create pairplot
'''
sns.pairplot(
    data=subset,
    vars=[f"V{i}" for i in range(1, 4)],
    hue="outcome",
    palette="Set1",   # you can try "coolwarm", "husl", etc.
    diag_kind="kde",  # kde or hist for diagonal plots
    plot_kws={'alpha':0.6, 's':15}  # make points smaller/transparent for readability
)
'''
#plt.suptitle("Pairplot of V1–3 Colored by Outcome", y=1.02)
#plt.show()

<div style="text-align: center;">
    <img src="https://imgur.com/kH8HZxH.png" alt="Boxplots of Numerical Features" width="700">
</div>

<div style="text-align: center;">
<p style="text-align: center; font-size: 14px; margin-bottom: 30px; margin-left: 20px; font-style: italic;">
    Figure 5. Pairplot of PCA Components V1-3 by Outcome
</p>
</div>

<div style="font-family: 'Poppins', sans-serif; font-size: 15px; padding-bottom: 25px">
<p style="line-height: 1.5; text-align: justify">
Figure 5 presents the relationship between the different principal components, while highlighting the fraudulent and non-fraudulent cases. Based on the pair plots, the fraud data points do not uniformly mix with the non-fraud data points. It forms it’s own clusters and bands, which means that the different principal components are able to identify significant patterns, showing distinct feature interactions. This is promising because it means the features do contain information to distinguish fraud, but the team needs to implement more than a simple linear model to capture it.
</p>                                                                                                                                                           
</div>

<div style="padding-bottom: 7px;">
</div>

In [123]:
#plt.figure(figsize=(10,6))
#plt.hist(data['Amount'], bins=100, edgecolor='black', alpha=0.5)
#plt.xlabel("Transaction Amount")
#plt.ylabel("Frequency")
#plt.title("Distribution of Transaction Amounts")

# Log scale
#plt.yscale("log")

#plt.show()


<div style="text-align: center;">
    <img src="https://imgur.com/3i2TSp3.png" alt="Boxplots of Numerical Features" width="700">
</div>

<div style="text-align: center;">
<p style="text-align: center; font-size: 14px; margin-bottom: 30px; margin-left: 20px; font-style: italic;">
    Figure 6. Histogram of Distribution of Transaction Amounts
</p>
</div>

<div style="font-family: 'Poppins', sans-serif; font-size: 15px; border-bottom: 2px solid #2a475e; padding-bottom: 15px">
<p style="line-height: 1.5; text-align: justify">
Figure 6 illustrates the highly right-skewed distribution of transaction amounts in the dataset. The majority of transactions are concentrated at relatively low amounts, as shown by the dense cluster between 0 and 5000 units, while a long tail extends to larger values. This imbalance demonstrates that models that miss even a few high-value frauds can lead to significant financial losses.
</p>
</div>

<div style="border-bottom: 1px solid #b6c6de; box-shadow: 0px 1px 0px #66c0f4; padding-bottom: 7px;">
</div>

<div style="background-color: #215880; padding: 10px; border-radius: 5px;">
    <h3 style="color: #e6f1fa; font-size: 30px; font-weight: bold; margin: 10px;">RESULTS AND DISCUSSION</h3>
</div>

In [None]:
def net_savings(y_true, y_pred, amounts):
    """
    Compute Net Savings = sum(TP amounts) - sum(FN amounts)
    
    Parameters:
    -----------
    y_true : array-like, true labels
    y_pred : array-like, predicted labels
    amounts : array-like, transaction amounts aligned with y_true
    
    Returns:
    --------
    float : net savings
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    amounts = np.array(amounts)
    
    # True Positives: fraud predicted as fraud
    tp_amount = amounts[(y_true == 1) & (y_pred == 1)].sum()
    
    # False Negatives: fraud predicted as non-fraud
    fn_amount = amounts[(y_true == 1) & (y_pred == 0)].sum()
    
    return tp_amount - fn_amount

def run_sampler_gridsearch(name, pipeline, param_grid, 
                           X_train, y_train, X_test, y_test, amt_test, 
                           results, cv=None, random_state=RANDOM_STATE):
    """
    Run GridSearchCV for a given sampler/classifier pipeline,
    optimize for Fraud Recall (FCR), evaluate on test set, 
    and append results.
    """

    # Define scorer for fraud recall
    fcr_scorer = make_scorer(recall_score, pos_label=1)
    
    if cv is None:
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
        
    # Grid search
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=fcr_scorer,   # optimize for FCR
        cv=cv,
        n_jobs=-1,
        verbose=2
    )
    grid_search.fit(X_train, y_train)
    
    # Best model
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    # Predictions on test set
    y_pred_test = best_model.predict(X_test)
    
    # Metrics
    precision = precision_score(y_test, y_pred_test)
    recall_macro = recall_score(y_test, y_pred_test, average='macro')
    f1 = f1_score(y_test, y_pred_test)
    fcr = recall_score(y_test, y_pred_test, pos_label=1) * 100
    net_sav = net_savings(y_test, y_pred_test, amt_test)
    
    # Append to results
    results.append({
        "Method": name,
        "Best Params": best_params,
        "Precision": precision,
        "Recall (Macro)": recall_macro,
        "F1": f1,
        "FCR (%)": fcr,
        "Net Savings": net_sav
    })
    
    return results

In [None]:
amounts = data['Amount'].copy()
X = data.drop(columns=['outcome', 'tid'])
y = data['outcome']

X_train_full, X_test, y_train_full, y_test, amt_train_full, amt_test = train_test_split(
    X, y, amounts, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

X_train, X_val, y_train, y_val, amt_train, amt_val = train_test_split(
    X_train_full, y_train_full, amt_train_full,
    test_size=0.25, random_state=RANDOM_STATE, stratify=y_train_full
)

In [117]:
results = []

# Baseline
pipeline_rf = Pipeline([
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE))
])

param_grid_rf = {}

results = run_sampler_gridsearch("Random Forest (untuned)", pipeline_rf, param_grid_rf, 
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)



# SMOTE
pipeline_smote = Pipeline([
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=500, n_jobs=-1))
])

param_grid_smote = {
    'smote__sampling_strategy': [0.2],
    'smote__k_neighbors': [3, 5, 7, 9, 11, 13, 15],
}

results = run_sampler_gridsearch("SMOTE", pipeline_smote, param_grid_smote,
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)



# SMOTE + Tomek
pipeline_smote_tomek = Pipeline([
    ('smote_tomek', SMOTE(random_state=RANDOM_STATE)),
    ('tomek', TomekLinks()),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=500, n_jobs=-1))
])

param_grid_smote_tomek = {
    'smote_tomek__sampling_strategy': [0.2],
    'smote_tomek__k_neighbors': [3, 5, 7, 9, 11, 13, 15],
}

results = run_sampler_gridsearch("SMOTE + Tomek", pipeline_smote_tomek, param_grid_smote_tomek,
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)



# ADASYN
pipeline_adasyn = Pipeline([
    ('adasyn', ADASYN(random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=500, n_jobs=-1))
])

param_grid_adasyn = {
    'adasyn__sampling_strategy': [0.2],
    'adasyn__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
}

results = run_sampler_gridsearch("ADASYN", pipeline_adasyn, param_grid_adasyn,
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)




# ADASYN + Tomek
pipeline_adasyn_tomek = Pipeline([
    ('adasyn_tomek', ADASYN(random_state=RANDOM_STATE)),
    ('tomek', TomekLinks()),
    ('clf', RandomForestClassifier(
        random_state=RANDOM_STATE,
        n_estimators=500,
        n_jobs=-1
    ))
])

param_grid_adasyn_tomek = {
    'adasyn_tomek__sampling_strategy': [0.2],
    'adasyn_tomek__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
}

results = run_sampler_gridsearch("ADASYN + Tomek", pipeline_adasyn_tomek, param_grid_adasyn_tomek,
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)



# Random Undersampling
pipeline_under = Pipeline([
    ('under', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(
        random_state=RANDOM_STATE,
        n_estimators=500,
        n_jobs=-1
    ))
])

param_grid_under = {}

results = run_sampler_gridsearch("Random Undersampling", pipeline_under, param_grid_under,
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)



# Random Oversampling
pipeline_over = Pipeline([
    ('over', RandomOverSampler(random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(
        random_state=RANDOM_STATE,
        n_estimators=500,
        n_jobs=-1
    ))
])

param_grid_over = {}

results = run_sampler_gridsearch("Random Oversampling", pipeline_over,
    param_grid_over,
    X_train, y_train, X_test, y_test, amt_test,
    results, random_state=RANDOM_STATE)



# Class weighting
weights = np.linspace(0, 100, 5)

pipeline_cw = Pipeline([
    ('clf', RandomForestClassifier(
        random_state=RANDOM_STATE,
        n_estimators=500,
        n_jobs=-1
    ))
])

param_grid_cw = {
    'clf__class_weight': [{0: w0, 1: 100 - w0} for w0 in weights]
}

results = run_sampler_gridsearch("Class Weights", pipeline_cw, param_grid_cw,
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END .................................................... total time=  25.1s
[CV] END .................................................... total time=  25.6s
[CV] END .................................................... total time=  26.0s
Fitting 3 folds for each of 7 candidates, totalling 21 fits
[CV] END .smote__k_neighbors=3, smote__sampling_strategy=0.2; total time= 3.2min
[CV] END .smote__k_neighbors=5, smote__sampling_strategy=0.2; total time= 3.5min
[CV] END .smote__k_neighbors=3, smote__sampling_strategy=0.2; total time= 3.6min
[CV] END .smote__k_neighbors=5, smote__sampling_strategy=0.2; total time= 3.7min
[CV] END .smote__k_neighbors=3, smote__sampling_strategy=0.2; total time= 3.7min
[CV] END .smote__k_neighbors=9, smote__sampling_strategy=0.2; total time= 3.9min
[CV] END .smote__k_neighbors=9, smote__sampling_strategy=0.2; total time= 3.9min
[CV] END .smote__k_neighbors=7, smote__sampling_strategy=0.2; total tim

In [118]:
results_df_final = pd.DataFrame(results)
results_df_final.to_csv('results_amount.csv', index=False)
results_df_final

Unnamed: 0,Method,Best Params,Precision,Recall (Macro),F1,FCR (%),Net Savings
0,Random Forest (untuned),{},0.918367,0.851508,0.79646,70.3125,-670.45
1,SMOTE,"{'smote__k_neighbors': 13, 'smote__sampling_st...",0.825397,0.906101,0.818898,81.25,688.95
2,SMOTE + Tomek,"{'smote_tomek__k_neighbors': 11, 'smote_tomek_...",0.825397,0.906101,0.818898,81.25,688.95
3,ADASYN,"{'adasyn__n_neighbors': 15, 'adasyn__sampling_...",0.8125,0.906088,0.8125,81.25,688.95
4,ADASYN + Tomek,"{'adasyn_tomek__n_neighbors': 13, 'adasyn_tome...",0.8125,0.906088,0.8125,81.25,688.95
5,Random Undersampling,{},0.057436,0.925068,0.107796,87.5,4771.63
6,Random Oversampling,{},0.923077,0.874946,0.827586,75.0,-430.41
7,Class Weights,"{'clf__class_weight': {0: 0.0, 1: 100.0}}",0.001729,0.5,0.003451,100.0,7177.73


 <center>
    <h2>Summary of Results</h2>
</center>

<div align="center">

| Machine Learning Method | Precision | Recall (Macro) | F1 Score | FCR (%) | Net Savings |
| :---: | :---: | :---: | :---: | :---: | :---: |
| Random Forest (untuned) | 0.918 | 0.852 | 0.796 | 70.31 | -670.45 |
| SMOTE | 0.825 | 0.906 | 0.819 | 81.25 | 688.95 |
| SMOTE + Tomek | 0.825 | 0.906 | 0.819 | 81.25 | 688.95 |
| ADASYN | 0.813 | 0.906 | 0.813 | 81.25 | 688.95 |
| ADASYN + Tomek | 0.813 | 0.906 | 0.813 | 81.25 | 688.95 |
| Random Undersampling | 0.057 | 0.925 | 0.108 | 87.50 | 4771.63 |
| Random Oversampling | 0.923 | 0.875 | 0.828 | 75.00 | -430.41 |
| Class Weights | 0.002 | 0.500 | 0.003 | 100.0 | 7177.73 |

</div>



<div style="font-family: 'Poppins', sans-serif; font-size: 15px; border-bottom: 2px solid #2a475e; padding-bottom: 15px">
<p style="line-height: 1.5; text-align: justify">
To evaluate different strategies for handling the imbalance of fraudulent cases in the data set, a Random Forest classifier was trained with various resampling and weighting methods. The performance was measured using both machine learning metrics: precision, recall, and F1 score, and CreditByte’s domain-specific metrics: detection rate and fraud capture rate (FCR). Detection rate is equivalent to recall, which indicates the proportion of fraud cases correctly identified. FCR calculates the percentage of all fraud transactions flagged by the model. Additionally, net savings was computed to quantify business impact, representing the financial benefit of prevented fraud minus the losses from missed fraud.
</p>

<p style="line-height: 1.5; text-align: justify">
The baseline Random Forest achieved strong precision of 0.91 and recall of 0.85, but still produced a negative net savings. This would suggest that while the model caught most fraudulent cases, the few high-value frauds it missed outweighed the savings from correctly detected ones. Applying oversampling methods such as SMOTE and ADASYN traded a slight decrease in precision for an improvent in all other metrics, with consistent positive net savings.
</p>

<p style="line-height: 1.5; text-align: justify">
Random undersampling produced the highest detection rate with recall reaching 0.93 and an FCR of 87.5%. From a financial perspective, this method returned 7 times the net savings of SMOTE and ADASYN. However, the significant savings came at the cost of a steep decline in precision of 0.06. This may indicate that the model underfits the majority class, while over-predicting fraud; The model increasingly flags legitimate transactions as fraudulent as it hasn't learned enough what a non-fraudulent transaction is.
</p>

<p style="line-height: 1.5; text-align: justify">
Random oversampling produced a recall of 0.87 and an FCR of 75.0%, which were improvements over the baseline Random Forest, but fell short in comparison to the balanced results of the synthetic oversampling methods. Precision remained relatively high at 0.92, but the financial outcome was negative with net savings of –430.41. Even if oversampling preserved the information from the majority class and helped the model avoid a severe dip in precision, it also introduced the risk of overfitting to repeated minority samples. Therefore, the model performed better at distinguishing legitimate transactions from fraud, yet failed to capture enough fraudulent cases to be of any positive business value.
</p>

<p style="line-height: 1.5; text-align: justify">
The class weighting showed the most extreme trade-off by forcing the model to prioritize fraud detection at all costs, hence the FCR of 100%, but precision, recall, and F1 score collapsed. Although this resulted in the largest calculated savings, such a model would overwhelm CreditByte with false positives.
</p>

<p style="line-height: 1.5; text-align: justify">
In summary, oversampling strategies such as SMOTE and ADASYN show the best balance between detection and precision, and has financial viability from the positive net savings. Meanwhile, undersampling and class weighting maximize fraud capture and financial returns but is inefficient at correctly tagging fraud cases.
</p>

<p style="line-height: 1.5; text-align: justify">
<b>The team recommends SMOTE</b> as the method for handling the imbalanced data set due to consistent results in detection and fraud capture, while generating positive net savings, making it the most defensible option both a technical and business standpoint. Over time, hybrid approaches such as SMOTE with Tomek Links may further improve performance by balancing fraud detection with reduced false positives.
</p>


</div>

<div style="border-bottom: 1px solid #b6c6de; box-shadow: 0px 1px 0px #66c0f4; padding-bottom: 7px;">
</div>

<div style="background-color: #215880; padding: 10px; border-radius: 5px;">
    <h3 style="color: #e6f1fa; font-size: 30px; font-weight: bold; margin: 10px;">CONCLUSION AND RECOMMENDATIONS</h3>
</div>

<div style="font-family: 'Poppins', sans-serif; font-size: 15px; border-bottom: 2px solid #2a475e; padding-bottom: 15px">
<p style="line-height: 1.5; text-align: justify">
In conclusion, the lab demonstrates the importance of handling class imbalance in fraud detection to improve technical performance and business profitability. Among the methods tested, undersampling and class weighting achieved the highest fraud capture but was outweighed by the drawbacks from low precision. SMOTE and other oversampling methods had the best balance and yielded consistent improvements in recall and FCR while maintaining financial viability through the positive net savings. These results demonstrate that SMOTE is the most appropriate technique for this dataset, and future works with hybrid methods may enhance model effectiveness.
</p>
</div>

<div style="border-bottom: 1px solid #b6c6de; box-shadow: 0px 1px 0px #66c0f4; padding-bottom: 7px;">
</div>

<div style="background-color: #215880; padding: 10px; border-radius: 5px;">
    <h3 style="color: #e6f1fa; font-size: 30px; font-weight: bold; margin: 10px;">SUPPLEMENTARY MATERIALS</h3>
</div>

<div style="padding: 0px; border-radius: 5px;">
    <h5 style="color: #215880; font-size: 20px; font-weight: bold; margin: 10px;">Applied pre-processing by scaling; Did not improve performance</h5>
</div>

In [101]:
# needed for cores in macbook
import os, multiprocessing

n_logical = multiprocessing.cpu_count()

n_logical = os.cpu_count()   # same as multiprocessing.cpu_count()
os.environ["LOKY_MAX_CPU_COUNT"] = str(n_logical)
print(f"LOKY_MAX_CPU_COUNT set to {n_logical}")


LOKY_MAX_CPU_COUNT set to 12


In [105]:
# Keep Amount separately for net_savings
amounts = data['Amount'].copy()

# Features: keep Amount inside X (so model can use it)
X = data.drop(columns=['outcome', 'tid'])
y = data['outcome']

# Split, carrying amounts separately for evaluation
X_train_full, X_test, y_train_full, y_test, amt_train_full, amt_test = train_test_split(
    X, y, amounts, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

X_train, X_val, y_train, y_val, amt_train, amt_val = train_test_split(
    X_train_full, y_train_full, amt_train_full,
    test_size=0.25, random_state=RANDOM_STATE, stratify=y_train_full
)

In [106]:
results = []

# Baseline
pipeline_rf = Pipeline([
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE))
])

param_grid_rf = {}

results = run_sampler_gridsearch("Random Forest (untuned)", pipeline_rf, param_grid_rf, 
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)



# SMOTE
pipeline_smote = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=500, n_jobs=-1))
])

param_grid_smote = {
    'smote__sampling_strategy': [0.2],
    'smote__k_neighbors': [3, 5, 7, 9, 11, 13, 15],
}

results = run_sampler_gridsearch("SMOTE", pipeline_smote, param_grid_smote,
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)



# SMOTE + Tomek
pipeline_smote_tomek = Pipeline([
    ('scaler', StandardScaler()),
    ('smote_tomek', SMOTE(random_state=RANDOM_STATE)),
    ('tomek', TomekLinks()),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=500, n_jobs=-1))
])

param_grid_smote_tomek = {
    'smote_tomek__sampling_strategy': [0.2],
    'smote_tomek__k_neighbors': [3, 5, 7, 9, 11, 13, 15],
}

results = run_sampler_gridsearch("SMOTE + Tomek", pipeline_smote_tomek, param_grid_smote_tomek,
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)



# ADASYN
pipeline_adasyn = Pipeline([
    ('scaler', StandardScaler()),
    ('adasyn', ADASYN(random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=500, n_jobs=-1))
])

param_grid_adasyn = {
    'adasyn__sampling_strategy': [0.2],
    'adasyn__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
}

results = run_sampler_gridsearch("ADASYN", pipeline_adasyn, param_grid_adasyn,
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)




# ADASYN + Tomek
pipeline_adasyn_tomek = Pipeline([
    ('scaler', StandardScaler()),
    ('adasyn_tomek', ADASYN(random_state=RANDOM_STATE)),
    ('tomek', TomekLinks()),
    ('clf', RandomForestClassifier(
        random_state=RANDOM_STATE,
        n_estimators=500,
        n_jobs=-1
    ))
])

param_grid_adasyn_tomek = {
    'adasyn_tomek__sampling_strategy': [0.2],
    'adasyn_tomek__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
}

results = run_sampler_gridsearch("ADASYN + Tomek", pipeline_adasyn_tomek, param_grid_adasyn_tomek,
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)



# Random Undersampling
pipeline_under = Pipeline([
    ('scaler', StandardScaler()),
    ('under', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(
        random_state=RANDOM_STATE,
        n_estimators=500,
        n_jobs=-1
    ))
])

param_grid_under = {}

results = run_sampler_gridsearch("Random Undersampling", pipeline_under, param_grid_under,
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)



# Random Oversampling
pipeline_over = Pipeline([
    ('scaler', StandardScaler()),
    ('over', RandomOverSampler(random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(
        random_state=RANDOM_STATE,
        n_estimators=500,
        n_jobs=-1
    ))
])

param_grid_over = {}

results = run_sampler_gridsearch("Random Oversampling", pipeline_over,
    param_grid_over,
    X_train, y_train, X_test, y_test, amt_test,
    results, random_state=RANDOM_STATE)



# Class weighting
weights = np.linspace(0, 100, 5)

pipeline_cw = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(
        random_state=RANDOM_STATE,
        n_estimators=500,
        n_jobs=-1
    ))
])

param_grid_cw = {
    'clf__class_weight': [{0: w0, 1: 100 - w0} for w0 in weights]
}

results = run_sampler_gridsearch("Class Weights", pipeline_cw, param_grid_cw,
                                 X_train, y_train, X_test, y_test, amt_test,
                                 results, random_state=RANDOM_STATE)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END .................................................... total time=  25.0s
[CV] END .................................................... total time=  25.5s
[CV] END .................................................... total time=  25.8s


In [116]:
results_df = pd.DataFrame(results)
results_df_amount_scaled = results_df
results_df_amount_scaled.to_csv('results_amount_scaled.csv', index=False)
results_df_amount_scaled

Unnamed: 0,Method,Best Params,Precision,Recall (Macro),F1,FCR (%),Net Savings
0,Random Forest (untuned),{},0.918367,0.851508,0.79646,70.3125,-670.45
1,SMOTE,"{'smote__k_neighbors': 13, 'smote__sampling_st...",0.78125,0.890436,0.78125,78.125,-423.97
2,SMOTE + Tomek,"{'smote_tomek__k_neighbors': 13, 'smote_tomek_...",0.78125,0.890436,0.78125,78.125,-423.97
3,ADASYN,"{'adasyn__n_neighbors': 13, 'adasyn__sampling_...",0.78125,0.890436,0.78125,78.125,-423.97
4,ADASYN + Tomek,"{'adasyn_tomek__n_neighbors': 13, 'adasyn_tome...",0.78125,0.890436,0.78125,78.125,-423.97
5,Random Undersampling,{},0.057436,0.925068,0.107796,87.5,4771.63
6,Random Oversampling,{},0.923077,0.874946,0.827586,75.0,-430.41
7,Class Weights,"{'clf__class_weight': {0: 0.0, 1: 100.0}}",0.001729,0.5,0.003451,100.0,7177.73


<div style="background-color: #215880; padding: 10px; border-radius: 5px;">
    <h3 style="color: #e6f1fa; font-size: 30px; font-weight: bold; margin: 10px;">REFERENCES</h3>
</div>