In [11]:
import evidently
import time
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently import ColumnMapping
import pandas as pd
import numpy as np

# 1. Load the raw training and test datasets (comma-separated CSV files).
df_train = pd.read_csv('application_train.csv', sep=",")
df_test = pd.read_csv('application_test.csv', sep=",")

# 2. Build the two feature frames that will be compared.
#    Training side: discard rows without a TARGET value, then remove the
#    identifier and the label so that only features remain.
application_train = (
    df_train
    .dropna(subset=['TARGET'])
    .drop(columns=['SK_ID_CURR', 'TARGET'])
)
#    Test side: only the identifier is removed here.
application_test = df_test.drop(columns=['SK_ID_CURR'])

# 3. Replace infinite values with NaN *before* looking for empty columns, so
#    that a column holding nothing but inf/NaN is recognised as empty below
#    instead of surviving as an all-NaN column that mean imputation cannot fill.
application_train = application_train.replace([np.inf, -np.inf], np.nan)
application_test = application_test.replace([np.inf, -np.inf], np.nan)

# 4. Drop columns that carry no value at all, then keep only the columns that
#    are still present in BOTH frames. The feature lists built below are used
#    to index the test frame as well, so a column missing on either side would
#    otherwise raise a KeyError. Each frame keeps its own column order.
application_train = application_train.dropna(axis=1, how='all')
application_test = application_test.dropna(axis=1, how='all')

shared_columns = set(application_train.columns) & set(application_test.columns)
application_train = application_train.drop(
    columns=[col for col in application_train.columns if col not in shared_columns]
)
application_test = application_test.drop(
    columns=[col for col in application_test.columns if col not in shared_columns]
)

# 5. Identify numerical and categorical columns from the training frame's
#    dtypes (number dtypes vs. object dtype).
numerical_columns = application_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = application_train.select_dtypes(include=[object]).columns.tolist()

# 6. Impute missing numerical values. Values are coerced to numeric first
#    (unparseable entries become NaN) and the means are computed on the coerced
#    data, so the statistic and the data it fills are always consistent.
#    Each frame is imputed with its own column means.
train_numeric = application_train[numerical_columns].apply(pd.to_numeric, errors='coerce')
application_train[numerical_columns] = train_numeric.fillna(train_numeric.mean())

test_numeric = application_test[numerical_columns].apply(pd.to_numeric, errors='coerce')
application_test[numerical_columns] = test_numeric.fillna(test_numeric.mean())

# 7. Impute missing categorical values with an explicit 'missing' category.
application_train[categorical_columns] = application_train[categorical_columns].fillna('missing')
application_test[categorical_columns] = application_test[categorical_columns].fillna('missing')

# 8. Tell Evidently which columns to treat as numerical and which as
#    categorical features.
column_mapping = ColumnMapping(
    numerical_features=numerical_columns,
    categorical_features=categorical_columns,
)

# 9. Assemble the data drift report: the 'ks' stat test is requested for
#    numerical features and 'psi' for categorical ones, each with a
#    threshold of 0.2.
drift_preset = DataDriftPreset(
    num_stattest='ks',
    cat_stattest='psi',
    num_stattest_threshold=0.2,
    cat_stattest_threshold=0.2,
)
data_drift_report = Report(metrics=[drift_preset])

print("Création du data_drift_report")

# 10. Run the report, using the training frame as the reference data and the
#     test frame as the current data, and measure how long the run takes.
start_time = time.time()
data_drift_report.run(
    reference_data=application_train,
    current_data=application_test,
    column_mapping=column_mapping,
)
elapsed_time_fit = time.time() - start_time
print(f"Temps d'exécution : {elapsed_time_fit:.2f} secondes")

# 11. Save the report as an HTML file. The path is held in one variable so the
#     confirmation message always names the file that was actually written
#     (previously the message announced a different file name).
report_path = 'data_drift_report_.html'
data_drift_report.save_html(report_path)
print(f"Rapport sauvegardé sous '{report_path}'")


Création du data_drift_report



invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide



Temps d'exécution : 70.31 secondes
Rapport sauvegardé sous 'data_drift_report.html'
