# Data loading and preprocessing

In [2]:
# import file
with open("../data/cleaned_data.pkl", "rb") as f:
    df = pickle.load(f)

In [3]:
df

Unnamed: 0,ZIBZIN,IDAvisAutorisationCheque,FlagImpaye,Montant,DateTransaction,CodeDecision,VerifianceCPT1,VerifianceCPT2,VerifianceCPT3,D2CB,...,TauxImpNB_CPM,EcartNumCheq,NbrMagasin3J,DiffDateTr1,DiffDateTr2,DiffDateTr3,CA3TRetMtt,CA3TR,Heure,JourSemaine
0,A013010004908126703060931,78643044,0,20.00,2017-02-01 07:32:14,1,0,0,0,551,...,52.076034,0,1,4.000000,4.0,4.0,20.00,0.00,27134,2
1,A013011306908024927155000,78643045,0,20.00,2017-02-01 07:43:37,1,0,0,0,551,...,52.076034,1,2,1.797685,4.0,4.0,28.61,8.61,27817,2
2,A013010002908283134592527,78643046,0,57.64,2017-02-01 07:47:38,1,0,0,0,549,...,52.076034,0,1,4.000000,4.0,4.0,57.64,0.00,28058,2
3,A011010002908105209831316,78643047,0,54.29,2017-02-01 07:48:48,0,1,1,1,267,...,53.554234,0,1,4.000000,4.0,4.0,54.29,0.00,28128,2
4,A013010041908000125652029,78643048,0,26.90,2017-02-01 08:13:27,1,0,0,0,549,...,52.076034,1,1,1.997106,4.0,4.0,59.15,32.25,29607,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4646769,A049010278908950520989501,84966399,0,23.94,2017-11-30 21:20:15,0,0,0,0,1,...,43.030421,0,1,5.000000,5.0,5.0,23.94,0.00,76815,3
4646770,A075000438908140000997961,84966400,0,92.60,2017-11-30 21:22:49,0,0,0,0,514,...,36.813027,1,2,1.035949,4.0,4.0,262.45,169.85,76969,3
4646771,A075000002908324024533014,84966401,0,69.00,2017-11-30 21:24:37,0,0,0,0,1,...,52.076034,0,1,5.000000,5.0,5.0,69.00,0.00,77077,3
4646772,A078010004908211306074580,84966402,0,57.80,2017-11-30 21:57:32,0,0,0,0,1,...,82.051282,0,1,5.000000,5.0,5.0,57.80,0.00,79052,3


In [4]:
# Changing type of EcartNumCheq column
df["EcartNumCheq"] = df["EcartNumCheq"].astype("int")

In [None]:
train_index = (df['DateTransaction'] >= '2017-02-01') & (df['DateTransaction'] <= '2017-08-31')
test_index = (df['DateTransaction'] >= '2017-09-01') & (df['DateTransaction'] <= '2017-11-30')

train = df[train_index]
test = df[test_index]

del train_index, test_index

# Variable to discard
to_discard = ['ZIBZIN', 'IDAvisAutorisationCheque', 'DateTransaction','CodeDecision']

In [6]:
test = test.drop(columns=to_discard)
train = train.drop(columns=to_discard)

y_train = train['FlagImpaye']
X_train = train.drop(columns=['FlagImpaye'])
y_test = test['FlagImpaye']
X_test = test.drop(columns=['FlagImpaye'])

# Pipeline preparation

In [None]:
models_config = {
    "xgboost": {
    "model": HistGradientBoostingClassifier(
        loss="log_loss"
    ),
    "params": {
        "max_iter": [200, 400],
        "learning_rate": [0.05, 0.1],
        "max_depth": [3, 5],
        "min_samples_leaf": [20, 50],
        "l2_regularization": [0.0, 0.1],
        "max_bins": [255],
        "class_weight": [{0: 1, 1: 10}, {0: 1, 1: 20}, {0: 1, 1: 5}],
    }
    },

    "RandomForest": {
        "model": RandomForestClassifier(n_jobs=-1),
        "params": {
            "n_estimators": [200, 300],
            "max_depth": [20, 30],  
            "class_weight": [{0: 1, 1: 10}, {0: 1, 1: 20}, {0: 1, 1: 5}],
        }
    },
}

In [30]:
# Scorer on positive class (fraud cases)
f1_fraud_scorer = make_scorer(
    f1_score,
    pos_label=1,  # Focus sur les fraudes
    average='binary'  # Binary classification
)

In [None]:
# Stratified K-Fold Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True)

for model_name, config in models_config.items():
    models_config[model_name]["grid_search"] = GridSearchCV(
        config["model"],
        param_grid=config["params"],
        cv=cv,
        scoring=f1_fraud_scorer,
        n_jobs=-1,
        verbose=2,
        return_train_score=True)



# Setting up the MLflow experiment
mlflow.set_experiment("full_dataset_models")

# Training using mlflow tracking
for model_name in models_config.keys():

    with mlflow.start_run(run_name=model_name):
        print(f"Training model: {model_name}")

        gs = models_config[model_name]["grid_search"]
        gs.fit(X_train, y_train)

        mlflow.log_params(gs.best_params_)
        mlflow.log_metric("best_score", gs.best_score_)
        mlflow.log_metric(
            "best_train_score",
            gs.cv_results_['mean_train_score'][gs.best_index_]
        )

        mlflow.sklearn.log_model(
            gs.best_estimator_,
            artifact_path=f"model_{model_name}"
        )

        results_df = pd.DataFrame(gs.cv_results_)
        results_df.to_csv("cv_results.csv", index=False)
        mlflow.log_artifact("cv_results.csv")

        print(f"Run ID: {mlflow.active_run().info.run_id}")

Training model: xgboost
Fitting 5 folds for each of 64 candidates, totalling 320 fits




Run ID: fb37c37e59d34ab98e667c2c3be12d67
Training model: RandomForest
Fitting 5 folds for each of 72 candidates, totalling 360 fits




MemoryError: Unable to allocate 11.9 MiB for an array with shape (3110774, 1) and data type int32

## Maunal exploration of randomforest

- RandomForestClassifier(n_jobs=-1, n_estimators=200, class_weight={0: 1, 1: 15}, max_depth=10)   -> 13%

In [27]:
clf = RandomForestClassifier(n_jobs=-1, n_estimators=300, class_weight={0: 1, 1: 20}, max_depth=10)

In [28]:
clf.fit(X_train, y_train)

KeyboardInterrupt: 

In [25]:
prediction = clf.predict(X_test)

In [26]:
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    730582
           1       0.13      0.14      0.13      6485

    accuracy                           0.98    737067
   macro avg       0.56      0.56      0.56    737067
weighted avg       0.98      0.98      0.98    737067

