## Label Drift

In [1]:
import numpy as np
import pandas as pd

from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import LabelDrift

### Carregamento dos dados de 2017 e 2021

In [5]:
enade_treino = pd.read_csv("../concept_drift/tabela_final_2017_treinamento.csv")
enade_teste = pd.read_csv("../concept_drift/tabela_final_2021_treinamento.csv")

In [6]:
train_dataset = Dataset(enade_treino, label='Nota_Conceito_Faixa', cat_features=[])
test_dataset = Dataset(enade_teste, label='Nota_Conceito_Faixa', cat_features=[])

### Uso do método LabelDrift da biblioteca

In [7]:
check = LabelDrift()
result = check.run(train_dataset=train_dataset, test_dataset=test_dataset)

In [8]:
result

VBox(children=(HTML(value='<h4><b>Label Drift</b></h4>'), HTML(value='<p>    Calculate label drift between tra…

## Feature Drift

In [2]:
from deepchecks.tabular.checks import FeatureDrift
from sklearn.svm import SVC

### Carregamento dos dados de 2017 e 2021

In [3]:
enade_treino = pd.read_csv("../concept_drift/tabela_final_2017_treinamento.csv")

numero_caracteristicas = enade_treino.shape[1] - 1
X_enade_treino = enade_treino.iloc[:, 0:numero_caracteristicas]
y_enade_treino = enade_treino.iloc[:, -1]

enade_teste = pd.read_csv("../concept_drift/tabela_final_2021_treinamento.csv")

In [4]:
train_dataset = Dataset(enade_treino, label='Nota_Conceito_Faixa', cat_features=[])
test_dataset = Dataset(enade_teste, label='Nota_Conceito_Faixa', cat_features=[])

### Uso do método FeatureDrift da biblioteca

#### Instanciação do modelo de preferência

In [9]:
model = SVC(probability=True)
model.fit(X_enade_treino, y_enade_treino)

#### Uso do método

In [10]:
check = FeatureDrift()
result = check.run(train_dataset=train_dataset, test_dataset=test_dataset, model=model, feature_importance_force_permutation=True)

Skipping permutation importance calculation: calculation was projected to finish in 2443 seconds, but timeout was configured to 120 seconds


In [11]:
result.value

{'Numero_Notas_Invalidas': {'Drift score': 0.025273361383127857,
  'Method': 'Kolmogorov-Smirnov',
  'Importance': None},
 'Numero_Faltantes': {'Drift score': 0.21466036166859048,
  'Method': 'Kolmogorov-Smirnov',
  'Importance': None},
 'Numero_Participantes': {'Drift score': 0.2159883829169304,
  'Method': 'Kolmogorov-Smirnov',
  'Importance': None},
 'nulos_UF_Ensino_Medio': {'Drift score': 0.06870462076925937,
  'Method': 'Kolmogorov-Smirnov',
  'Importance': None},
 'RO': {'Drift score': 0.009352000099291269,
  'Method': 'Kolmogorov-Smirnov',
  'Importance': None},
 'AC': {'Drift score': 0.006131238286728458,
  'Method': 'Kolmogorov-Smirnov',
  'Importance': None},
 'AM': {'Drift score': 0.010171153392659837,
  'Method': 'Kolmogorov-Smirnov',
  'Importance': None},
 'RR': {'Drift score': 0.006295068945402216,
  'Method': 'Kolmogorov-Smirnov',
  'Importance': None},
 'PA': {'Drift score': 0.017740874508197724,
  'Method': 'Kolmogorov-Smirnov',
  'Importance': None},
 'AP': {'Drift 

In [12]:
result.show()

VBox(children=(HTML(value='<h4><b>Feature Drift</b></h4>'), HTML(value='<p>    Calculate drift between train d…

## Multivariate Drift

In [13]:
from deepchecks.tabular.checks import MultivariateDrift

In [14]:
enade_treino = pd.read_csv("../concept_drift/tabela_final_2017_treinamento.csv")

numero_caracteristicas = enade_treino.shape[1] - 1
X_enade_treino = enade_treino.iloc[:, 0:numero_caracteristicas]
y_enade_treino = enade_treino.iloc[:, -1]

enade_teste = pd.read_csv("../concept_drift/tabela_final_2021_treinamento.csv")

In [15]:
train_dataset = Dataset(enade_treino, label='Nota_Conceito_Faixa', cat_features=[])
test_dataset = Dataset(enade_teste, label='Nota_Conceito_Faixa', cat_features=[])

In [16]:
check = MultivariateDrift()
result = check.run(train_dataset=train_dataset, test_dataset=test_dataset)

In [None]:
result.value

In [110]:
result.show()

VBox(children=(HTML(value='<h4><b>Multivariate Drift</b></h4>'), HTML(value='<p>    Calculate drift between th…