## Data Validation Experiment

In [1]:
import json

import pandas as pd
from evidently.dashboard import Dashboard
from evidently.model_profile import Profile
from evidently.profile_sections import DataDriftProfileSection
from evidently.tabs import DataDriftTab, CatTargetDriftTab
from sklearn.model_selection import train_test_split



In [2]:
# Read visadataset.csv (path is relative to the kernel's working directory)
# into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="visadataset.csv")
df.head(n=5)

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [3]:
# Remove the per-row identifier column so it is not part of the drift analysis.
# Reassigning (instead of inplace=True) and errors="ignore" keep this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=["case_id"], errors="ignore")

In [4]:
# Preview the first five rows to confirm case_id is no longer present.
df.head(n=5)

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [5]:
# Separate the case_status column (y) from the remaining columns (X), then
# show the dimensions of X.
y = df["case_status"]
X = df.drop(columns="case_status")
X.shape

(25480, 10)

In [6]:
# Split the full frame (case_status included) 80/20. The fixed random_state
# makes the split, and everything computed from it below, reproducible.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
# Preview the first five rows of the training split; the index shows the
# shuffled original row labels.
train_set.head(n=5)

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
2403,Asia,High School,Y,Y,1411,2009,West,75107.37,Year,Y,Denied
6830,North America,Master's,Y,N,781,2012,West,102403.56,Year,Y,Denied
878,Asia,High School,Y,N,958,2005,South,89991.95,Year,Y,Certified
24061,Asia,Master's,N,N,900,1962,West,128104.61,Year,Y,Denied
2109,Europe,High School,N,N,3533,1993,West,12647.14,Year,Y,Denied


In [8]:
# Build an evidently Dashboard containing a single DataDriftTab.
data_drift_dashboard = Dashboard(tabs = [DataDriftTab()])

In [9]:
# Run the dashboard computation on the two splits.
# NOTE(review): train_set is passed first and test_set second; presumably
# evidently treats them as reference and current data respectively — confirm
# against the installed evidently version.
data_drift_dashboard.calculate(train_set , test_set)

In [10]:
# Display the computed dashboard (presumably rendered in the cell output —
# confirm for the notebook front end in use).
data_drift_dashboard.show()

In [19]:
# Save the dashboard to data_drift_test.html; the path is relative, so the
# file lands in the kernel's current working directory.
data_drift_dashboard.save("data_drift_test.html")

In [12]:
# Build an evidently Profile with a single DataDriftProfileSection; its
# results are read back as JSON in the cells below.
data_drift_profile = Profile(sections=[DataDriftProfileSection()])

In [13]:
# Run the profile computation on the same two splits used for the dashboard.
# NOTE(review): argument order presumably means (reference, current) — confirm
# against the installed evidently version.
data_drift_profile.calculate(train_set, test_set)

In [14]:
# Serialize the computed profile; the result is a JSON-formatted string.
report = data_drift_profile.json()

In [15]:
# Show the raw JSON string. NOTE(review): this prints the entire report and
# produces a very long output; consider clearing it before sharing.
report

'{"data_drift": {"name": "data_drift", "datetime": "2025-08-16 13:00:20.912078", "data": {"utility_columns": {"date": null, "id": null, "target": null, "prediction": null}, "num_feature_names": ["no_of_employees", "prevailing_wage", "yr_of_estab"], "cat_feature_names": ["case_status", "continent", "education_of_employee", "full_time_position", "has_job_experience", "region_of_employment", "requires_job_training", "unit_of_wage"], "text_feature_names": [], "datetime_feature_names": [], "target_names": null, "options": {"confidence": null, "drift_share": 0.5, "nbinsx": 10, "xbins": null}, "metrics": {"n_features": 11, "n_drifted_features": 0, "share_drifted_features": 0.0, "dataset_drift": false, "no_of_employees": {"current_small_hist": {"x": [-26.0, 60183.5, 120393.0, 180602.5, 240812.0, 301021.5, 361231.0, 421440.5, 481650.0, 541859.5, 602069.0], "y": [1.6276240483858407e-05, 2.3140029522505947e-07, 5.8664863578184096e-08, 1.629579543838447e-08, 9.777477263030682e-09, 6.51831817535378

In [16]:
# Parse the JSON string from the profile into nested dicts/lists so individual
# metrics can be looked up by key in the following cells.
json_report = json.loads(report)
json_report

{'data_drift': {'name': 'data_drift',
  'datetime': '2025-08-16 13:00:20.912078',
  'data': {'utility_columns': {'date': None,
    'id': None,
    'target': None,
    'prediction': None},
   'num_feature_names': ['no_of_employees', 'prevailing_wage', 'yr_of_estab'],
   'cat_feature_names': ['case_status',
    'continent',
    'education_of_employee',
    'full_time_position',
    'has_job_experience',
    'region_of_employment',
    'requires_job_training',
    'unit_of_wage'],
   'text_feature_names': [],
   'datetime_feature_names': [],
   'target_names': None,
   'options': {'confidence': None,
    'drift_share': 0.5,
    'nbinsx': 10,
    'xbins': None},
   'metrics': {'n_features': 11,
    'n_drifted_features': 0,
    'share_drifted_features': 0.0,
    'dataset_drift': False,
    'no_of_employees': {'current_small_hist': {'x': [-26.0,
       60183.5,
       120393.0,
       180602.5,
       240812.0,
       301021.5,
       361231.0,
       421440.5,
       481650.0,
       541859

In [17]:
# Percentage of features flagged as drifted, derived from the counts in the
# report's metrics section.
drift_metrics = json_report["data_drift"]["data"]["metrics"]
n_features = drift_metrics["n_features"]
n_drifted_features = drift_metrics["n_drifted_features"]

# Guard the division: a report with zero features would otherwise raise
# ZeroDivisionError; with nothing to compare, report 0% drift.
data_drift_percentage = (n_drifted_features / n_features) * 100 if n_features else 0.0

data_drift_percentage

0.0

In [18]:
# Read the dataset-level drift flag from the report's metrics section and
# display it (a boolean in the parsed report).
drift_status = json_report["data_drift"]["data"]["metrics"]["dataset_drift"]
drift_status

False