# Using undersampling to reequilibrate the classes, and the reiterate a selection of stacking model on the rebalanced dataset.

**Strategy:**
- For each validation set / For each model:
    - Undersample the majority category (down to 10 % ?)
    - SMOTE the minority class (up to 25%)
    - Train select the best hyperparameters for various models
    - Stack the models
    - Apply on the full test dataset

**To be tried**
- Tomek link (When classes overlap)
- NearMiss (When decision boundary matters)

## Library loading

In [2]:
# path managememnt
import os

# print path
print("Current working directory:", os.getcwd())
# change path
os.chdir("C:/Users/axel-/Documents/Coding/SISE_FraudAnalysis")
os.getcwd()

Current working directory: C:\Users\axel-\Documents\Coding\SISE_FraudAnalysis


'C:\\Users\\axel-\\Documents\\Coding\\SISE_FraudAnalysis'

In [21]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
import pickle
from sklearn.metrics import make_scorer, f1_score
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
import mlflow
import time

## Data preparation

In [4]:
with open('data/cleaned_data.pkl', 'rb') as f:
    df = pickle.load(f)

In [5]:
df.head()

Unnamed: 0,ZIBZIN,IDAvisAutorisationCheque,FlagImpaye,Montant,DateTransaction,CodeDecision,VerifianceCPT1,VerifianceCPT2,VerifianceCPT3,D2CB,...,TauxImpNB_CPM,EcartNumCheq,NbrMagasin3J,DiffDateTr1,DiffDateTr2,DiffDateTr3,CA3TRetMtt,CA3TR,Heure,JourSemaine
0,A013010004908126703060931,78643044,0,20.0,2017-02-01 07:32:14,1,0,0,0,551,...,52.076034,0,1,4.0,4.0,4.0,20.0,0.0,27134,2
1,A013011306908024927155000,78643045,0,20.0,2017-02-01 07:43:37,1,0,0,0,551,...,52.076034,1,2,1.797685,4.0,4.0,28.61,8.61,27817,2
2,A013010002908283134592527,78643046,0,57.64,2017-02-01 07:47:38,1,0,0,0,549,...,52.076034,0,1,4.0,4.0,4.0,57.64,0.0,28058,2
3,A011010002908105209831316,78643047,0,54.29,2017-02-01 07:48:48,0,1,1,1,267,...,53.554234,0,1,4.0,4.0,4.0,54.29,0.0,28128,2
4,A013010041908000125652029,78643048,0,26.9,2017-02-01 08:13:27,1,0,0,0,549,...,52.076034,1,1,1.997106,4.0,4.0,59.15,32.25,29607,2


In [6]:
# Changing type of EcartNumCheq column
df["EcartNumCheq"] = df["EcartNumCheq"].astype("int")

In [7]:
train_index = (df['DateTransaction'] >= '2017-02-01') & (df['DateTransaction'] <= '2017-08-31')
test_index = (df['DateTransaction'] >= '2017-09-01') & (df['DateTransaction'] <= '2017-11-30')

train = df[train_index]
test = df[test_index]
    
del train_index, test_index

# Variable to discard
to_discard = ['ZIBZIN', 'IDAvisAutorisationCheque', 'DateTransaction','CodeDecision']

In [8]:
test = test.drop(columns=to_discard)
train = train.drop(columns=to_discard)

y_train = train['FlagImpaye']
X_train = train.drop(columns=['FlagImpaye'])
y_test = test['FlagImpaye']
X_test = test.drop(columns=['FlagImpaye'])

In [9]:
# Scorer on positive class (fraud cases)
f1_fraud_scorer = make_scorer(
    f1_score,
    pos_label=1,  # Focus sur les fraudes
    average='binary'  # Binary classification
)

## Random Undersampling  

In [10]:
unique, counts = np.unique(y_train, return_counts=True)
print(unique, counts)

[0 1] [3865122   23346]


In [11]:
target_ratio = 0.20
# define strategy for undersampling
sampling = {
    0: counts[1],  
    1: counts[1]     
    }

In [12]:
# undersample train set down to 10 % of the minority class
rus = RandomUnderSampler(sampling_strategy = 0.10)

In [13]:
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [14]:
y_resampled.value_counts(normalize=True)

FlagImpaye
0    0.909091
1    0.090909
Name: proportion, dtype: float64

In [15]:
# apply SMOTE to resampled data to reach at least 20% of minority class
smote = SMOTE(sampling_strategy=0.25)
X_final, y_final = smote.fit_resample(X_resampled, y_resampled)

In [16]:
y_final.value_counts(normalize=True)

FlagImpaye
0    0.8
1    0.2
Name: proportion, dtype: float64

In [17]:
print("Number of samples after undersampling and SMOTE:")
print(y_final.value_counts())

Number of samples after undersampling and SMOTE:
FlagImpaye
0    233460
1     58365
Name: count, dtype: int64


# Pipeline configuration

In [18]:
cv = StratifiedKFold(n_splits=5, shuffle=True)

# Define the percentage of minority class after resampling
target_undersampler = 0.10
target_smote = 0.25


# define models and their hyperparameters (keep the same sampling strategy in the pipeline)
models_params = {
    'RandomForest': {
        'pipeline': Pipeline([
            ('rus', RandomUnderSampler(sampling_strategy=target_undersampler, )),
            ('smote', SMOTE(sampling_strategy=target_smote, )),
            ('model', RandomForestClassifier(n_jobs=-1))
        ]),
        'params': {
            'model__n_estimators': [200, 300, 500],
            'model__max_depth': [15, 20],
            'model__min_samples_split': [5, 10],
            'model__class_weight': [{0: 1, 1: 10}, {0:1, 1:20},  {0:1, 1:15} , 'balanced']
        }
    },

    'LightGBM': {
        'pipeline': Pipeline([
            ('rus', RandomUnderSampler(sampling_strategy=target_undersampler, )),
            ('smote', SMOTE(sampling_strategy=target_smote, )),
            ('model', LGBMClassifier(n_jobs=-1, verbose=-1))
        ]),
        'params': {
            'model__n_estimators': [300, 400],
            'model__max_depth': [7, 10],
            'model__learning_rate': [0.05, 0.1],
            'model__num_leaves': [31, 50],
            'model__class_weight': [{0: 1, 1: 10}, {0:1, 1:20},  {0:1, 1:15} , 'balanced']
        }
    },

    'XGBoost': {
        'pipeline': Pipeline([
            ('rus', RandomUnderSampler(sampling_strategy=target_undersampler, )),
            ('smote', SMOTE(sampling_strategy=target_smote, )),
            ('model', HistGradientBoostingClassifier(max_iter=300))
        ]),
        'params': {
            'model__n_estimators': [200, 300],
            'model__max_depth': [5, 7],
            'model__learning_rate': [0.05, 0.1],
            'model__scale_pos_weight': [3, 4],
            'model__subsample': [0.8, 0.9],
            'model__class_weight': [{0: 1, 1: 10}, {0:1, 1:20},  {0:1, 1:15} , 'balanced']
        }
    },
    
    # LogisticRegression
    'logistic_regression': {
        'pipeline': Pipeline([
            ('rus', RandomUnderSampler(sampling_strategy=target_undersampler, )),
            ('smote', SMOTE(sampling_strategy=target_smote, )),
            ('model', LogisticRegression(max_iter=1000, n_jobs=-1))
        ]),
        'params': {
            'model__C': [0.01, 0.1, 1.0, 10.0],
            'model__penalty': ['l2'],
            'model__class_weight': [{0: 1, 1: 10}, {0:1, 1:20},  {0:1, 1:15} , 'balanced']
        }
    },

        # Neural network
    'neural_network': {
        'pipeline': Pipeline([
            ('rus', RandomUnderSampler(sampling_strategy=target_undersampler, )),
            ('smote', SMOTE(sampling_strategy=target_smote, )),
            ('model', MLPClassifier(max_iter=500))
        ]),
        'params': {
            'model__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'model__activation': ['relu', 'tanh'],
            'model__alpha': [0.0001, 0.001, 0.01],
        }
    },
}

In [None]:
# Set up the gridsearch with cross validation and mlflow

mlflow.set_experiment("sampling_models_experiments")

for model_name, mp in models_params.items():

    with mlflow.start_run(run_name=model_name):
        print(f"Starting GridSearchCV for {model_name}...")
        grid_search = GridSearchCV(
            estimator=mp['pipeline'],
            param_grid=mp['params'],
            scoring=f1_fraud_scorer,
            cv=cv,
            n_jobs=-1,
            verbose=2
        )


        start_time = time.time()

        # Fit the grid search
        grid_search.fit(X_train, y_train)

        end_time = time.time()

        # log the best parameters and the best score
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("best_f1_score", grid_search.best_score_)
        mlflow.sklearn.log_model(grid_search.best_estimator_, artifact_path="model")
        mlflow.log_metric("training_time_seconds", end_time - start_time)
        
        
        print(f"Completed GridSearchCV for {model_name}. Best F1 Score: {grid_search.best_score_}")

2026/01/13 21:50:24 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/13 21:50:24 INFO mlflow.store.db.utils: Updating database tables
2026/01/13 21:50:24 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/13 21:50:24 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/13 21:50:24 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2026/01/13 21:50:24 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2026/01/13 21:50:24 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2026/01/13 21:50:24 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2026/01/13 21:50:24 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2026/01/13 21:50:24 INFO alembic.runtime.migration: Running 

Starting GridSearchCV for RandomForest...
Fitting 5 folds for each of 48 candidates, totalling 240 fits


TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'

In [None]:
time.time()

Exemple of pipeline
```python
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_val_predict
import numpy as np

# Base model 1: full dataset
model_full = RandomForestClassifier(random_state=42)

# Base model 2: undersampled
model_under = Pipeline([
    ('rus', RandomUnderSampler(random_state=42)),
    ('clf', GradientBoostingClassifier())
])

# Get OOF predictions
oof_full = cross_val_predict(model_full, X_train, y_train, cv=5, method='predict_proba')[:,1]
oof_under = cross_val_predict(model_under, X_train, y_train, cv=5, method='predict_proba')[:,1]

# Stack predictions as features for meta-model
X_meta = np.column_stack([oof_full, oof_under])

meta_model = LogisticRegression()
meta_model.fit(X_meta, y_train)

# Test predictions
pred_full_test = model_full.fit(X_train, y_train).predict_proba(X_test)[:,1]
pred_under_test = model_under.fit(X_train, y_train).predict_proba(X_test)[:,1]
X_meta_test = np.column_stack([pred_full_test, pred_under_test])
final_pred = meta_model.predict(X_meta_test)
```