Data drift analysis using the Evidently library, comparing `application_train` (the reference dataset) with `application_test` (the current dataset under analysis).

In [161]:
# %pip install evidently  # uncomment and run once if Evidently is missing from this kernel's environment

In [162]:
import pandas as pd
from sklearn import datasets

from evidently import DataDefinition
from evidently import Dataset
from evidently import Report
from evidently.presets import DataDriftPreset
from evidently.presets import DataSummaryPreset

In [163]:
# Read the two application tables compared by Evidently: the train file is
# the reference data, the test file is the current data under analysis.
# TARGET is removed from the reference frame right after loading
# (presumably because the test file carries no TARGET column).
current_dataset = pd.read_csv('./data/application_test.csv')
reference_dataset = pd.read_csv('./data/application_train.csv').drop(columns="TARGET")

In [164]:
# Display (rows, columns) for the reference frame, then for the current frame
tuple(frame.shape for frame in (reference_dataset, current_dataset))

((307511, 121), (48744, 121))

In [165]:
# Check that both datasets expose the same columns, in the same order, with
# the same dtypes. Series.equals compares the column labels together with the
# dtype values, whereas comparing plain lists of dtypes would only verify the
# positional dtype sequence and could miss renamed or reordered columns.
reference_dataset.dtypes.equals(current_dataset.dtypes)

True

In [166]:
# Distinct dtypes present among the reference columns, in order of first appearance
column_dtypes = reference_dataset.dtypes
column_dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [167]:
# Show the dtype of each column in the reference dataset (pandas truncates the display)
reference_dataset.dtypes

SK_ID_CURR                      int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
FLAG_OWN_REALTY                object
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 121, dtype: object

In [168]:
# Partition the reference columns by dtype: int64/float64 columns are treated
# as numerical, every remaining column as categorical. Column order is kept.
numeric_dtypes = ("int64", "float64")
dtype_by_column = reference_dataset.dtypes
numerical_columns = [
    name for name, kind in dtype_by_column.items() if kind in numeric_dtypes
]
categorical_columns = [
    name for name, kind in dtype_by_column.items() if kind not in numeric_dtypes
]

In [169]:
# Describe the role of each column for Evidently.
# SK_ID_CURR is the application identifier, not a feature: it is declared as
# the id column and filtered out of the numerical features so that it does not
# get summary statistics or tests computed as if it were a measurement.
ID_COLUMN = "SK_ID_CURR"
schema = DataDefinition(
    id_column=ID_COLUMN,
    numerical_columns=[col for col in numerical_columns if col != ID_COLUMN],
    categorical_columns=categorical_columns,
)

In [170]:
# Wrap the reference frame (application_train without TARGET) as an Evidently
# Dataset. reference_dataset is already a DataFrame, so it is passed directly
# instead of being re-wrapped in pd.DataFrame(...).
eval_ref = Dataset.from_pandas(
    reference_dataset,
    data_definition=schema,
)

In [171]:
# Wrap the current frame (application_test) as an Evidently Dataset using the
# same schema as the reference. current_dataset is already a DataFrame, so it
# is passed directly instead of being re-wrapped in pd.DataFrame(...).
eval_cur = Dataset.from_pandas(
    current_dataset,
    data_definition=schema,
)

In [159]:
# Build the report: DataDriftPreset performs the actual drift comparison
# between the two datasets, DataSummaryPreset adds per-column descriptive
# statistics. include_tests takes a boolean, not the string "True".
report = Report([DataDriftPreset(), DataSummaryPreset()], include_tests=True)

# Report.run expects the current data first and the reference data second;
# keyword arguments make the roles explicit so they cannot be swapped.
my_eval = report.run(current_data=eval_cur, reference_data=eval_ref)

In [160]:
# Export the evaluation result as a standalone HTML file in the working directory
report_path = "data_drift_analysis.html"
my_eval.save_html(report_path)