# Important data

In [4]:
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.model_selection import train_test_split


from evidently.dashboard import Dashboard
from evidently.tabs import NumTargetDriftTab,CatTargetDriftTab

# Target Drift

In [5]:
reference_data = pd.read_csv("training_data.csv",
                                            header=None,
                                            names=[ "day{}".format(i) for i in range(0,14) ]+["target"] )


In [6]:
production_scored_data = pd.read_csv("scored_data.csv",
                                            header=None,
                                            names=[ "day{}".format(i) for i in range(0,14) ]+["target"] )


In [7]:
production_scored_data

Unnamed: 0,day0,day1,day2,day3,day4,day5,day6,day7,day8,day9,day10,day11,day12,day13,target
0,1,1,0,0,0,0,1,1,0,0,0,1,0,0,1
1,1,0,0,0,0,1,1,0,0,0,1,0,0,0,1
2,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0
3,0,0,0,1,1,0,0,0,1,0,0,0,1,0,1
4,0,0,1,1,0,0,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,0,0,0,1,0,0,1,1,1,0,1,0,1,1,0
73,0,0,1,0,0,1,1,1,0,1,0,1,1,0,1
74,0,1,0,0,1,1,1,0,1,0,1,1,0,1,0
75,1,0,0,1,1,1,0,1,0,1,1,0,1,1,0


In [8]:
EXPERIMENT_NAME="./reports_target_drift"
mlflow.set_experiment(EXPERIMENT_NAME)
with mlflow.start_run():
    model_target_drift = Dashboard(reference_data,production_scored_data,
                               tabs=[ CatTargetDriftTab])
    model_target_drift.save(EXPERIMENT_NAME+"/target_drift.html")
    drift_dashboard._save_to_json(EXPERIMENT_NAME+"/target_drift.json")
    mlflow.log_artifacts(EXPERIMENT_NAME)

NameError: name 'mlflow' is not defined

# Model Performance

In [15]:
import xgboost as xgb
import mlflow
from evidently.tabs import ClassificationPerformanceTab

# Train model and log performance tests

In [23]:
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.xgboost.autolog()
EXPERIMENT_NAME="reports_model_performance"
mlflow.set_experiment(EXPERIMENT_NAME)

threshold=0.5

with mlflow.start_run() as run:
    

    X=reference_data.iloc[:,:-1]
    Y=reference_data.iloc[:,-1]


    reference, production, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=4284, stratify=Y)


    reference_train = xgb.DMatrix(reference,label=y_train)


    dproduction= xgb.DMatrix(production)
    dreference=xgb.DMatrix(reference)
    

    model=xgb.train(dtrain=reference_train,params={})

    
    train_proba_predict = model.predict(dreference)
    test_proba_predict = model.predict(dproduction)

    test_predictions = [1. if y_cont > threshold else 0. for y_cont in test_proba_predict]
    train_predictions = [1. if y_cont > threshold else 0. for y_cont in train_proba_predict]
    

    reference['target'] = y_train
    reference['prediction'] = train_predictions

    production['target'] = y_test
    production['prediction'] = test_predictions
    
    classification_performance = Dashboard(reference, production,  
                       tabs=[ClassificationPerformanceTab])
    
    classification_performance.save('.reports/'+EXPERIMENT_NAME+'.html')
    

    mlflow.log_artifact('.reports/'+EXPERIMENT_NAME+'.html')


