#### Importation des librairies :

In [11]:
import os
import pandas as pd
import numpy as np
import zipfile
import gc
import mlflow
import mlflow.sklearn

#### Importation du fichier : 

In [9]:
# Chemin vers le dossier contenant les fichiers extraits
dossier_data = "C:/Users/paulm/Documents/GitHub/Projet7/data"

# Liste des fichiers dans le dossier data
fichiers_data = os.listdir(dossier_data)
print(f"Fichiers dans le dossier data : {fichiers_data}")

# Exemple de chargement d'un fichier CSV
# Supposons que vous ayez un fichier CSV nommé "train.csv" dans le dossier data
chemin_fichier_train = os.path.join(dossier_data, 'application_test.csv')
train_data = pd.read_csv(chemin_fichier_train)

# Afficher les premières lignes du DataFrame pour vérifier que le fichier a été chargé correctement
print(train_data.head())


Fichiers dans le dossier data : ['application_test.csv', 'application_train.csv', 'bureau.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'HomeCredit_columns_description.csv', 'installments_payments.csv', 'POS_CASH_balance.csv', 'previous_application.csv', 'Projet+Mise+en+prod+-+home-credit-default-risk (1).zip', 'sample_submission.csv']
   SK_ID_CURR NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  \
0      100001         Cash loans           F            N               Y   
1      100005         Cash loans           M            N               Y   
2      100013         Cash loans           M            Y               Y   
3      100028         Cash loans           F            N               Y   
4      100038         Cash loans           M            Y               N   

   CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  AMT_GOODS_PRICE  \
0             0          135000.0    568800.0      20560.5         450000.0   
1             0           99000.

#### Initialisation de l'environnement MLFlow :

In [10]:
pip install mlflow

Note: you may need to restart the kernel to use updated packages.


In [14]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

## Préparation des données et feature engineering :

### Étape 1: Prétraitement des données et sauvegarde des résultats intermédiaires

In [15]:
import pandas as pd
import os
import gc
import mlflow

# Chemin vers le dossier contenant les fichiers extraits
dossier_data = "C:/Users/paulm/Documents/GitHub/Projet7/data"

# Démarrer une nouvelle exécution d'expérience MLflow
mlflow.start_run()

# Fonction pour charger les données
def load_data(file_name):
    return pd.read_csv(os.path.join(dossier_data, file_name))

# Charger les données d'application_train
df = load_data('application_train.csv')

# Enregistrer la taille des données
mlflow.log_metric("application_train_rows", df.shape[0])
mlflow.log_metric("application_train_columns", df.shape[1])

# Charger et traiter les autres fichiers de données

# Charger bureau.csv
bureau = load_data('bureau.csv')

# Charger bureau_balance.csv
bureau_balance = load_data('bureau_balance.csv')

# Charger previous_application.csv
previous_applications = load_data('previous_application.csv')

# Charger POS_CASH_balance.csv
pos_cash_balance = load_data('POS_CASH_balance.csv')

# Charger installments_payments.csv
installments_payments = load_data('installments_payments.csv')

# Charger credit_card_balance.csv
credit_card_balance = load_data('credit_card_balance.csv')

# Fonction pour le traitement du bureau et de l'équilibre du bureau
def process_bureau_and_balance(bureau, bureau_balance, num_rows=None, nan_as_category=True):
    bb, bb_cat = one_hot_encoder(bureau_balance, nan_as_category=True)
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'mean', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category=True)
    bureau_aggregations = {'DAYS_CREDIT': ['min', 'max', 'mean'],
                           'CREDIT_DAY_OVERDUE': ['max', 'mean'],
                           'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
                           'DAYS_ENDDATE_FACT': ['min', 'max', 'mean'],
                           'AMT_CREDIT_MAX_OVERDUE': ['mean'],
                           'CNT_CREDIT_PROLONG': ['sum'],
                           'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
                           'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
                           'AMT_CREDIT_SUM_OVERDUE': ['mean'],
                           'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
                           'AMT_ANNUITY': ['max', 'mean', 'sum'],
                           'MONTHS_BALANCE_MIN': ['min'],
                           'MONTHS_BALANCE_MAX': ['max'],
                           'MONTHS_BALANCE_SIZE': ['mean', 'sum']}
    for col in bureau_cat:
        bureau_aggregations[col] = ['mean']
    bureau_agg = bureau.groupby('SK_ID_CURR').agg(bureau_aggregations)
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    del bureau, bb
    gc.collect()
    return bureau_agg

# Fonction pour le traitement des applications précédentes
def process_previous_applications(previous_applications, num_rows=None, nan_as_category=True):
    prev, cat_cols = one_hot_encoder(previous_applications, nan_as_category=True)
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
        'DAYS_FIRST_DRAWING': ['min', 'max', 'mean'],
        'DAYS_FIRST_DUE': ['min', 'max', 'mean'],
        'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
        'DAYS_LAST_DUE': ['min', 'max', 'mean'],
        'DAYS_TERMINATION': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    prev_agg = prev.groupby('SK_ID_CURR').agg(aggregations)
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    del prev
    gc.collect()
    return prev_agg

# Fonction pour le traitement du solde POS-CASH
def process_pos_cash(pos_cash_balance, num_rows=None, nan_as_category=True):
    pos, cat_cols = one_hot_encoder(pos_cash_balance, nan_as_category=True)
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg

# Fonction pour le traitement des paiements d'installations
def process_installments_payments(installments_payments, num_rows=None, nan_as_category=True):
    ins, cat_cols = one_hot_encoder(installments_payments, nan_as_category=True)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    ins['DAYS_LATE_PAYMENT'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DAYS_EARLY_PAYMENT'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DAYS_INSTALMENT': ['max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'PAYMENT_PERC': ['mean', 'sum'],
        'PAYMENT_DIFF': ['mean', 'sum'],
        'DAYS_LATE_PAYMENT': ['mean', 'sum', 'max'],
        'DAYS_EARLY_PAYMENT': ['mean', 'sum', 'max']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    del ins
    gc.collect()
    return ins_agg

# Fonction pour le traitement du solde de la carte de crédit
def process_credit_card_balance(credit_card_balance, num_rows=None, nan_as_category=True):
    cc, cat_cols = one_hot_encoder(credit_card_balance, nan_as_category=True)
    cc.drop(['SK_ID_PREV'], axis=1, inplace=True)
    cc_aggregations = {
        'MONTHS_BALANCE': ['min', 'max', 'mean', 'size'],
        'AMT_BALANCE': ['min', 'max', 'mean', 'sum'],
        'AMT_CREDIT_LIMIT_ACTUAL': ['min', 'max', 'mean', 'sum'],
        'AMT_DRAWINGS_ATM_CURRENT': ['min', 'max', 'mean', 'sum'],
        'AMT_DRAWINGS_CURRENT': ['min', 'max', 'mean', 'sum'],
        'AMT_DRAWINGS_OTHER_CURRENT': ['min', 'max', 'mean', 'sum'],
        'AMT_DRAWINGS_POS_CURRENT': ['min', 'max', 'mean', 'sum'],
        'AMT_INST_MIN_REGULARITY': ['min', 'max', 'mean', 'sum'],
        'AMT_PAYMENT_CURRENT': ['min', 'max', 'mean', 'sum'],
        'AMT_PAYMENT_TOTAL_CURRENT': ['min', 'max', 'mean', 'sum'],
        'AMT_RECEIVABLE_PRINCIPAL': ['min', 'max', 'mean', 'sum'],
        'AMT_RECIVABLE': ['min', 'max', 'mean', 'sum'],
        'AMT_TOTAL_RECEIVABLE': ['min', 'max', 'mean', 'sum'],
        'CNT_DRAWINGS_ATM_CURRENT': ['min', 'max', 'mean', 'sum'],
        'CNT_DRAWINGS_CURRENT': ['min', 'max', 'mean', 'sum'],
        'CNT_DRAWINGS_OTHER_CURRENT': ['min', 'max', 'mean', 'sum'],
        'CNT_DRAWINGS_POS_CURRENT': ['min', 'max', 'mean', 'sum'],
        'CNT_INSTALMENT_MATURE_CUM': ['min', 'max', 'mean', 'sum']
    }
    for cat in cat_cols:
        cc_aggregations[cat] = ['mean']
    cc_agg = cc.groupby('SK_ID_CURR').agg(cc_aggregations)
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    del cc
    gc.collect()
    return cc_agg

# Fonction pour encoder en one-hot et gérer les valeurs manquantes
def one_hot_encoder(df, nan_as_category=True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

# Appeler les fonctions de traitement
bureau_processed = process_bureau_and_balance(bureau, bureau_balance)
previous_app_processed = process_previous_applications(previous_applications)
pos_cash_processed = process_pos_cash(pos_cash_balance)
installments_processed = process_installments_payments(installments_payments)
credit_card_processed = process_credit_card_balance(credit_card_balance)

# Fusionner tous les dataframes traités avec le dataframe principal (df)
df = df.merge(bureau_processed, on='SK_ID_CURR', how='left')
df = df.merge(previous_app_processed, on='SK_ID_CURR', how='left')
df = df.merge(pos_cash_processed, on='SK_ID_CURR', how='left')
df = df.merge(installments_processed, on='SK_ID_CURR', how='left')
df = df.merge(credit_card_processed, on='SK_ID_CURR', how='left')

# Enregistrer les données traitées dans un fichier CSV
processed_data_file = os.path.join(dossier_data, 'processed_data.csv')
df.to_csv(processed_data_file, index=False)

# Enregistrer les paramètres finaux et les métriques finales dans MLflow
mlflow.log_param("final_dataframe_shape", df.shape)
mlflow.log_metric("final_dataframe_rows", df.shape[0])
mlflow.log_metric("final_dataframe_columns", df.shape[1])

# Terminer l'exécution de l'expérience MLflow
mlflow.end_run()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate objec