## Data Validation Experiment Using Evidently

In [1]:
import json

import pandas as pd
from evidently.dashboard import Dashboard
from evidently.model_profile import Profile
from evidently.profile_sections import DataDriftProfileSection
from evidently.tabs import DataDriftTab, CatTargetDriftTab
from sklearn.model_selection import train_test_split

In [2]:
# Read the wine dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="wine.csv")
df.head(n=5)

Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,6.0,0.18,0.31,1.4,0.036,14.0,75.0,0.99085,3.34,0.58,11.1,8
1,white,5.3,0.395,0.07,1.3,0.035,26.0,102.0,0.992,3.5,0.35,10.6,6
2,red,8.1,0.56,0.28,1.7,0.368,16.0,56.0,0.9968,3.11,1.28,9.3,5
3,white,6.4,0.22,0.34,1.4,0.023,56.0,115.0,0.98958,3.18,0.7,11.7,6
4,red,9.4,0.27,0.53,2.4,0.074,6.0,18.0,0.9962,3.2,1.13,12.0,7


In [3]:
# y is the 'quality' column; X is every other column of df.
y = df['quality']
X = df.drop('quality', axis='columns')

In [4]:
# Hold out 20% of the rows of df. The fixed random_state makes the split
# reproducible across kernel restarts. train_set is later used as the drift
# reference and test_set as the current data.
# (train_test_split is imported in the notebook's top import cell.)
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
# Preview the first rows of the training split (index labels are the shuffled
# row positions from df).
train_set.head(n=5)

Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1916,white,6.1,0.16,0.24,1.4,0.046,17.0,77.0,0.99319,3.66,0.57,10.3,6
947,white,6.9,0.33,0.26,5.0,0.027,46.0,143.0,0.9924,3.25,0.43,11.2,7
877,white,5.6,0.26,0.18,1.4,0.034,18.0,135.0,0.99174,3.32,0.35,10.2,6
2927,white,5.6,0.41,0.22,7.1,0.05,44.0,154.0,0.9931,3.3,0.4,10.5,5
6063,white,7.2,0.13,0.46,1.3,0.044,48.0,111.0,0.99127,2.97,0.45,11.1,5


In [6]:
# Dashboard containing a single tab: the data-drift view.
data_drift_dashboard = Dashboard(
    tabs=[DataDriftTab()],
)

In [7]:
# Compare the held-out rows (current) against the training rows (reference).
data_drift_dashboard.calculate(
    reference_data=train_set,
    current_data=test_set,
)

In [8]:
# Persist the computed dashboard as an HTML file in the working directory.
data_drift_dashboard.save("data_drift_test.html")

In [9]:
# Profile with a single data-drift section. Its result is exported as JSON in
# a later cell.
data_drift_profile = Profile(
    sections=[DataDriftProfileSection()],
)

In [10]:
# Same reference/current pairing as the dashboard.
# NOTE(review): no column mapping is passed here; the parsed report further
# down lists 'quality' among the numeric features and leaves 'target' as None.
# Confirm that treating the label as an ordinary feature is intended.
data_drift_profile.calculate(
    reference_data=train_set,
    current_data=test_set,
)

In [11]:
# Export the profile result; the returned value is parsed with json.loads in
# the next cell.
report = data_drift_profile.json()

In [13]:
# Parse the profile's JSON export into a nested dict so individual metrics can
# be looked up by key.
# (json is imported in the notebook's top import cell.)
json_report = json.loads(report)
json_report

{'data_drift': {'name': 'data_drift',
  'datetime': '2025-08-29 14:48:19.769998',
  'data': {'utility_columns': {'date': None,
    'id': None,
    'target': None,
    'prediction': None},
   'num_feature_names': ['alcohol',
    'chlorides',
    'citric acid',
    'density',
    'fixed acidity',
    'free sulfur dioxide',
    'pH',
    'quality',
    'residual sugar',
    'sulphates',
    'total sulfur dioxide',
    'volatile acidity'],
   'cat_feature_names': ['wine type'],
   'text_feature_names': [],
   'datetime_feature_names': [],
   'target_names': None,
   'options': {'confidence': None,
    'drift_share': 0.5,
    'nbinsx': 10,
    'xbins': None},
   'metrics': {'n_features': 13,
    'n_drifted_features': 0,
    'share_drifted_features': 0.0,
    'dataset_drift': False,
    'alcohol': {'current_small_hist': {'x': [8.4,
       8.98,
       9.56,
       10.14,
       10.719999999999999,
       11.3,
       11.879999999999999,
       12.459999999999999,
       13.04,
       13.62,


In [14]:
# Percentage of evaluated columns that the report flags as drifted.
drift_metrics = json_report["data_drift"]["data"]["metrics"]
n_features = drift_metrics["n_features"]
n_drifted_features = drift_metrics["n_drifted_features"]

data_drift_percentage = n_drifted_features / n_features * 100
data_drift_percentage

0.0

In [15]:
# Read the report's dataset-level drift flag (False in the recorded run).
drift_metrics = json_report["data_drift"]["data"]["metrics"]
drift_status = drift_metrics["dataset_drift"]
drift_status

False

### No data drift detected. Next, implement this check in a modular way.

In [16]:
# List the column labels of the training split.
train_set.columns

Index(['wine type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')