In [None]:
import pandas as pd

from evidently import Dataset
from evidently import DataDefinition
from evidently import Report
from evidently.presets import DataDriftPreset

from evidently.tests.numerical_tests import TestStatus as num_test_status
from evidently.tests.categorical_tests import TestStatus as cat_test_status


In [None]:
# Load the train and validation splits.
# The raw versions of the files are read so they can be used with Evidently's API.
RAW_TRAIN_PATH = '../data/raw/train.csv'
RAW_VAL_PATH = '../data/raw/val.csv'

# index_col=False tells pandas not to use the first CSV column as the index.
train = pd.read_csv(RAW_TRAIN_PATH, index_col=False)
val = pd.read_csv(RAW_VAL_PATH, index_col=False)

In [5]:
# Show the column names of the training split (used to build the schema below).
train.columns

Index(['StudentID', 'Age', 'Gender', 'Ethnicity', 'ParentalEducation',
       'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport',
       'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GradeClass'],
      dtype='object')

In [6]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.0
1,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,4.0
2,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,3.0
3,1006,18,0,0,1,8.191219,0,0,1,1,0,0,0,1.0
4,1007,15,0,1,1,15.60168,10,0,3,0,1,0,0,2.0


In [7]:
# Column roles handed to Evidently.
# Only the two continuous measurements are declared numerical; every other
# listed column (including Age and the GradeClass label) is declared categorical.
# StudentID is not assigned a role here.
numerical_columns = ["StudyTimeWeekly", "Absences"]
categorical_columns = [
    'Age',
    'Gender',
    'Ethnicity',
    'ParentalEducation',
    'Tutoring',
    'ParentalSupport',
    'Extracurricular',
    'Sports',
    'Music',
    'Volunteering',
    'GradeClass',
]

schema = DataDefinition(
    numerical_columns=numerical_columns,
    categorical_columns=categorical_columns,
)

In [8]:
# Convert both splits into Evidently Dataset objects that share one schema.
# Each frame is wrapped in a new pd.DataFrame before being handed to Evidently,
# and the splits are processed in order: train first, then val.
train_ev, val_ev = (
    Dataset.from_pandas(pd.DataFrame(frame), data_definition=schema)
    for frame in (train, val)
)

In [11]:
# Build a data-drift report from Evidently's preset and enable the tests
# whose results are inspected further down.
# include_tests must be a real boolean: any non-empty string (even "False")
# is truthy, so the string "True" only worked by accident.
report = Report(
    [DataDriftPreset()],
    include_tests=True,
)

# NOTE(review): Evidently documents this call as run(current_data, reference_data).
# As written, train is passed as the current dataset and val as the reference;
# confirm that this ordering is intended.
my_eval = report.run(train_ev, val_ev)

In [21]:
# Print the evaluation result as a dictionary; it holds a 'metrics' list and
# the 'tests' list that the following cells read.
print(my_eval.dict())

{'metrics': [{'id': '15e89f895b482f9b84ba7274ed18a106', 'metric_id': 'DriftedColumnsCount(drift_share=0.5)', 'value': {'count': 2.0, 'share': 0.15384615384615385}}, {'id': '00dd8ce4b56e6aae44128c0cca15d261', 'metric_id': 'ValueDrift(column=StudyTimeWeekly)', 'value': np.float64(0.39858272652622134)}, {'id': '0bc46a3cec87db56ff52decf30215a10', 'metric_id': 'ValueDrift(column=Absences)', 'value': np.float64(0.6626556080329581)}, {'id': '8f5d1c60a32d6fc1bd54bc53af61d8e8', 'metric_id': 'ValueDrift(column=Age)', 'value': np.float64(0.08569339421307885)}, {'id': '36230dcfc62097d9b2725a795000a999', 'metric_id': 'ValueDrift(column=Gender)', 'value': np.float64(0.5663586113768899)}, {'id': '0318315f1e57c8412de5b393bd387a0f', 'metric_id': 'ValueDrift(column=Ethnicity)', 'value': np.float64(0.008480662996859609)}, {'id': '7db670e70400b7e4689ac0aa335094cb', 'metric_id': 'ValueDrift(column=ParentalEducation)', 'value': np.float64(0.0002070530515027384)}, {'id': 'd8a63127d09a52824f55857944a21f0c', '

In [None]:
# Number of test results produced by the report.
# The recorded output of this cell is a count (14), so the expression is
# wrapped in len(); the bare list would display every test entry instead.
len(my_eval.dict()['tests'])

14

In [57]:
# FAIL status members from the numerical and categorical test modules,
# used below to spot failed drift tests.
num_fail, cat_fail = num_test_status.FAIL, cat_test_status.FAIL

In [58]:
# Walk through every test result and print a warning for each failed one.
# Drift in the dependent variable gets a stronger, dedicated message.

# Dependent variable of the dataset; drift here is reported separately.
TARGET_COLUMN = 'GradeClass'

# Build the result dictionary once instead of re-creating it on every access,
# and iterate over whatever tests exist rather than a hard-coded count.
tests = my_eval.dict()['tests']
fail_statuses = (num_fail, cat_fail)

for test in tests:
    if test['status'] not in fail_statuses:
        continue

    # Per-column test names look like "Value Drift for column <name>", so the
    # target is recognised by name instead of by its position in the list.
    if test['name'].endswith(f'column {TARGET_COLUMN}'):
        print('OH NO!\n')
        print('It appears you have value drift for the dependent variable\n')
        print('Here is the log:\n')
        print(test['description'])
        print(' ')
        print('You should strongly consider re-generating the split\n')
        print('If not, it is possible that your model will be ineffective\n')
        continue

    # Any other failed test: report its name and description.
    print('Uh oh!')
    print('There seems to be a drift in a column!\n')
    print(test['name'])
    print(' ')
    print(test['description'])
    print(' ')
    print('You might want to consider re-generating the train-val-test split\n')


Uh oh!
There seems to be a drift in a column!

Value Drift for column Ethnicity
 
Drift score is 0.01. The drift detection method is chi-square p_value. The drift threshold is 0.05.
 
You might want to consider re-generating the train-val-test split

Uh oh!
There seems to be a drift in a column!

Value Drift for column ParentalEducation
 
Drift score is 0.00. The drift detection method is chi-square p_value. The drift threshold is 0.05.
 
You might want to consider re-generating the train-val-test split

