In [1]:
import os
os.environ["MLFLOW_TRACKING_URI"] = "http://127.0.0.1:5000"

In [2]:
# Import des biblioth√®ques n√©cessaires
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
# Charger les donn√©es pr√©trait√©es
app_train = pd.read_csv("Base De Donn√©e Pr√©trait√©e.csv")
app_test = pd.read_csv("Base De Donn√©e Pr√©trait√©e Test.csv")

In [4]:
# S√©parer les caract√©ristiques et la cible
if 'TARGET' in app_train:
    train_labels = app_train['TARGET']
    train = app_train.drop(columns=['TARGET'])
else:
    train = app_train.copy()
    
# Pr√©paration des donn√©es de test
test = app_test.copy()

# Imputation m√©diane des valeurs manquantes
imputer = SimpleImputer(strategy='median')
scaler = MinMaxScaler(feature_range=(0, 1))

# Ajuster l'imputer sur les donn√©es d'entra√Ænement et transformer
train = imputer.fit_transform(train)
test = imputer.transform(test)

# Ajuster le scaler sur les donn√©es d'entra√Ænement et transformer
train = scaler.fit_transform(train)
test = scaler.transform(test)

# S√©paration des donn√©es d'entra√Ænement pour validation
X_train, X_test, y_train, y_test = train_test_split(train, train_labels, test_size=0.3, random_state=0)

In [5]:
# D√©finir la grille des hyperparam√®tres
param_grid = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}

In [6]:
# Initialiser le mod√®le
log_reg = LogisticRegression(random_state=0)

In [7]:
# Configurer le GridSearchCV
grid_search = GridSearchCV(estimator=log_reg, 
                           param_grid=param_grid, 
                           scoring='roc_auc', 
                           cv=5, 
                           n_jobs=-1, 
                           verbose=1)

In [8]:
# D√©marrer une exp√©rimentation MLFlow
with mlflow.start_run():
    # Ex√©cuter la recherche en grille
    grid_search.fit(X_train, y_train)
    
    # Obtenir les meilleurs param√®tres et les meilleures performances
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Entra√Æner le meilleur mod√®le sur l‚Äôensemble d‚Äôentra√Ænement complet
    best_model = grid_search.best_estimator_
    best_model.fit(X_train, y_train)
    
    # √âvaluation sur l‚Äôensemble de test
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    fp = conf_matrix[0][1]
    fn = conf_matrix[1][0]
    cost = fp + 10 * fn

    # Enregistrement des m√©triques et param√®tres dans MLFlow
    mlflow.log_params(best_params)
    mlflow.log_metric("Best CV AUC", best_score)
    mlflow.log_metric("Test AUC", auc)
    mlflow.log_metric("Test Accuracy", accuracy)
    mlflow.log_metric("Test Cost", cost)
    mlflow.log_param("model_type", "LogisticRegression")

    # Loguer le meilleur mod√®le
    mlflow.sklearn.log_model(best_model, "best_logistic_regression_model")

    # Afficher les r√©sultats
    print("Best Parameters found by GridSearchCV:", best_params)
    print(f"Best CV AUC Score: {best_score}")
    print(f"Test AUC: {auc}")
    print(f"Test Accuracy: {accuracy}")
    print(f"Test Cost: {cost}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters found by GridSearchCV: {'C': 10, 'solver': 'liblinear'}
Best CV AUC Score: 0.7465902314585946
Test AUC: 0.7485621372447766
Test Accuracy: 0.9194614867648015
Test Cost: 73220


- Tester tous les mod√®les sans gridsearchCV (1 ou 2 par grandes familles)
- Selectionner les meilleurs mod√®les car ce ne sont pas les hyperparam√®tres qui changent beaucoup. 
- Tester les valeurs extr√™mes des hyperparam√®tres pour savoir dans quel direction aller. 
- Ensuite on peut choisir le meilleur mod√®le et faire √©voluer les hyperparam√®tres. 
- Un fois que le gridsearchCV est termin√©, on n'en fait plus et on reste sur les valeurs selectionn√©es. 
- Il faut trouver trouver des mod√®les avec un features importance globale et locale (pas de blackbox)

Pour le d√©s√©quilibre, regarder les mod√®les qui int√®grent les d√©s√©quilibres. Ou alors, cr√©er des faux clients qui n'auraient pas et de cr√©tit (regarder SMOTE) Data Augmentation

DataDRIFT
Introduction de la notion de temps en se pla√ßant dans le pass√©. Se mettre comme si on √©tait en 2022 par exemple. 