# Data drift dashboard in a Jupyter notebook

In [1]:
# try:
#     import evidently
# except:
#     !pip install git+https://github.com/evidentlyai/evidently.git

In [1]:
import pandas as pd
import numpy as np
import requests
import zipfile
import io

from datetime import datetime, time
from sklearn import datasets, ensemble

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset, RegressionPreset

## Passenger Survival Data

The original Evidently tutorial (blog post "How to break a model in 20 days") automatically downloads the bike demand dataset from UCI, which is slightly different from the dataset used in the Kaggle competition: https://www.kaggle.com/c/bike-sharing-demand/data. This adaptation does not download anything. It reads a local `train.csv` of passenger records (`PassengerId`, `Survived`, `Pclass`, `Sex`, ...), so `train.csv` and `test.csv` must be present in the working directory before running the cells below.

In [2]:
# Training split; the relative path is resolved against the working directory.
train_csv = "train.csv"
df = pd.read_csv(train_csv)

In [5]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
# Test split; the relative path is resolved against the working directory.
test_csv = "test.csv"
df2 = pd.read_csv(test_csv)

In [15]:
# Recode Sex as string labels in one pass: 'female' -> '2', 'male' -> '1'.
# Any other value is kept unchanged.
sex_values = df['Sex']
df['Sex'] = np.where(
    sex_values == 'female',
    '2',
    np.where(sex_values == 'male', '1', sex_values),
)

## Regression Model

### Model training

In [16]:
# Column roles shared by the model and the Evidently reports.
target = 'Survived'
prediction = 'prediction'
numerical_features = ['Pclass', 'SibSp', 'Parch']
categorical_features = ['Sex']

# One-hot encoded feature matrices built from the train and test frames.
features = ['Pclass', 'Sex', 'SibSp', 'Parch']
train_feature_frame = df[features]
test_feature_frame = df2[features]
X_train = pd.get_dummies(train_feature_frame)
X_test = pd.get_dummies(test_feature_frame)

In [35]:
# Split by row position: the first 300 rows form the reference (training)
# window and the rows from position 300 up to 890 form the current window.
# The current slice starts at 300 so that no row falls between the two
# windows. Both frames are explicit copies, so adding columns to them later
# does not write into a slice of `df` (which raises SettingWithCopyWarning).
reference = df.iloc[0:300].copy()
current = df.iloc[300:891].copy()

# reference = raw_data.loc['2011-01-01 00:00:00':'2011-01-28 23:00:00']
# current = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']

In [36]:
# Preview the first five rows of the reference window.
reference.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",2,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [37]:
# Random forest with 50 trees and a fixed random_state.
regressor = ensemble.RandomForestRegressor(n_estimators=50, random_state=0)

In [38]:
# Train on the reference window only, numerical columns first, then categorical.
model_columns = numerical_features + categorical_features
regressor.fit(reference[model_columns], reference[target])

In [39]:
# Score both windows with the fitted model, using the same column order as in training.
model_inputs = numerical_features + categorical_features
ref_prediction = regressor.predict(reference[model_inputs])
current_prediction = regressor.predict(current[model_inputs])

In [40]:
# Store the model output in the 'prediction' column of each window.
# `assign` returns new frames, so nothing is written into a slice of `df`
# (which raises SettingWithCopyWarning) and the cell can be re-run safely.
reference = reference.assign(prediction=ref_prediction)
current = current.assign(prediction=current_prediction)

### Model Performance 

In [41]:
# Tell Evidently which columns hold the target, the prediction and each feature type.
column_mapping = ColumnMapping(
    target=target,
    prediction=prediction,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

In [42]:
# Model quality at training time: the reference window is passed as the
# current data and no reference data is supplied.
training_render_options = {"render": {"raw_data": True}}
regression_performance = Report(
    metrics=[RegressionPreset()],
    options=training_render_options,
)
regression_performance.run(
    reference_data=None,
    current_data=reference,
    column_mapping=column_mapping,
)

In [59]:
# regression_performance.show()

In [44]:
#regression_performance.save('reports/regression_performance_at_training.html')

## Week 1

In [60]:
# Regression quality on index labels 301 through 400 (inclusive) of the
# current window, compared with the reference window.
week1_rows = current.loc[301:400]
week1_render_options = {"render": {"raw_data": True}}
regression_performance = Report(
    metrics=[RegressionPreset()],
    options=week1_render_options,
)
regression_performance.run(
    reference_data=reference,
    current_data=week1_rows,
    column_mapping=column_mapping,
)

# regression_performance.show()

In [46]:
#regression_performance.save('reports/regression_performance_after_week1.html')

In [61]:
# Target drift on index labels 301 through 400 (inclusive) of the current
# window, compared with the reference window.
week1_drift_rows = current.loc[301:400]
week1_drift_options = {"render": {"raw_data": True}}
target_drift = Report(
    metrics=[TargetDriftPreset()],
    options=week1_drift_options,
)
target_drift.run(
    reference_data=reference,
    current_data=week1_drift_rows,
    column_mapping=column_mapping,
)

# target_drift.show()

In [48]:
#target_drift.save('reports/target_drift_after_week1.html')

## Week 2

In [62]:
# Regression quality on index labels 401 through 500 (inclusive) of the
# current window, compared with the reference window.
week2_rows = current.loc[401:500]
week2_render_options = {"render": {"raw_data": True}}
regression_performance = Report(
    metrics=[RegressionPreset()],
    options=week2_render_options,
)
regression_performance.run(
    reference_data=reference,
    current_data=week2_rows,
    column_mapping=column_mapping,
)

# regression_performance.show()

In [50]:
#regression_performance.save('reports/regression_performance_after_week2.html')

In [63]:
# Target drift on index labels 401 through 500 (inclusive) of the current
# window, compared with the reference window.
week2_drift_rows = current.loc[401:500]
week2_drift_options = {"render": {"raw_data": True}}
target_drift = Report(
    metrics=[TargetDriftPreset()],
    options=week2_drift_options,
)
target_drift.run(
    reference_data=reference,
    current_data=week2_drift_rows,
    column_mapping=column_mapping,
)

# target_drift.show()

In [52]:
#target_drift.save('reports/target_drift_after_week2.html')

## Week 3

In [64]:
# Regression quality on index labels 501 through 829 (inclusive) of the
# current window, compared with the reference window.
week3_rows = current.loc[501:829]
week3_render_options = {"render": {"raw_data": True}}
regression_performance = Report(
    metrics=[RegressionPreset()],
    options=week3_render_options,
)
regression_performance.run(
    reference_data=reference,
    current_data=week3_rows,
    column_mapping=column_mapping,
)

# regression_performance.show()

In [26]:
#regression_performance.save('reports/regression_performance_after_week3.html')

In [65]:
# Target drift on index labels 501 through 829 (inclusive) of the current
# window, compared with the reference window.
week3_drift_rows = current.loc[501:829]
week3_drift_options = {"render": {"raw_data": True}}
target_drift = Report(
    metrics=[TargetDriftPreset()],
    options=week3_drift_options,
)
target_drift.run(
    reference_data=reference,
    current_data=week3_drift_rows,
    column_mapping=column_mapping,
)

# target_drift.show()

In [55]:
#target_drift.save('reports/target_drift_after_week3.html')

## Data Drift

In [29]:
# Fresh mapping for the data drift report: only the numerical features are
# declared, every other field keeps its ColumnMapping default.
column_mapping = ColumnMapping(numerical_features=numerical_features)

In [66]:
# Feature drift on index labels 301 through 825 (inclusive) of the current
# window, compared with the reference window.
drift_rows = current.loc[301:825]
drift_render_options = {"render": {"raw_data": True}}
data_drift = Report(
    metrics=[DataDriftPreset()],
    options=drift_render_options,
)
data_drift.run(
    reference_data=reference,
    current_data=drift_rows,
    column_mapping=column_mapping,
)

# data_drift.show()

In [31]:
#data_drift.save("reports/data_drift_dashboard_after_week1.html")

# Support Evidently
Enjoyed the tutorial? Star Evidently on GitHub to contribute back! This helps us continue creating free open-source tools for the community. https://github.com/evidentlyai/evidently