In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

%matplotlib inline

pd.set_option('display.width', None)
pd.set_option('display.max_columns', None) 

%load_ext autoreload
%autoreload 2
import logging
logging.basicConfig(level=logging.INFO)

import warnings
warnings.filterwarnings('ignore')

In [3]:
classes = ['DOG','CAT']

In [4]:
df_1 = pd.DataFrame({
    'A': np.random.choice(classes, 70, p=[0.8, 0.2]),
    'B': np.random.choice(classes, 70, p=[0.8, 0.2])
})

In [5]:
df_2 = pd.DataFrame({
    'A': np.random.choice(classes, 30, p=[0.7, 0.3]),
    'B': np.random.choice(classes, 30, p=[0.3, 0.7])
})

In [6]:
df = pd.concat([df_1, df_2])

In [7]:
df

Unnamed: 0,A,B
0,CAT,DOG
1,DOG,DOG
2,CAT,DOG
3,DOG,DOG
4,CAT,CAT
...,...,...
25,DOG,CAT
26,DOG,CAT
27,DOG,CAT
28,CAT,DOG


In [8]:
train, test = np.split(df, [int(0.7*len(df))])

In [9]:
def generate_temp_dataframe(train, test):
    train, test = train.copy(), test.copy()
    
    train = train.assign(type_df =  'train')
    test = test.assign(type_df = 'test')
    
    return pd.concat([train, test])

In [10]:
from scipy.stats import chi2_contingency


def chi_square_test(df):
    cat_cols = [col for col in df.select_dtypes(['object','category'])]
    cat_cols.remove('type_df')
    drift_cols = []
    
    for col in cat_cols:
        c, p, dof, expected = chi2_contingency(pd.crosstab(df[col], df['type_df']))
        
        if p < 0.05:
            drift_cols.append(col)
            
    return drift_cols

In [11]:
df_temp = generate_dataframe(train, test)

In [12]:
chi_square_test(df_temp)

['B']

In [None]:
def remove_drifted_categoricals(df, cols):
    return df.drop(columns=cols)

----

# Numerical drift (based on MLBox)

In [16]:
from mlbox.preprocessing import Reader, Drift_thresholder

In [24]:
paths = ["../data/train.csv","../data/test.csv"]
target_name = "Species"

In [25]:
rd = Reader(sep = ",")

In [26]:
df = rd.train_test_split(paths, target_name)


reading csv : train.csv ...
cleaning data ...
CPU time: 12.702234029769897 seconds

reading csv : test.csv ...
cleaning data ...
CPU time: 4.582151889801025 seconds

> Number of common features : 4

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 0
> Number of numerical features: 4
> Number of training samples : 104
> Number of test samples : 45

> You have no missing values on train set...

> Task : classification
Iris-setosa        35
Iris-versicolor    35
Iris-virginica     34
Name: Species, dtype: int64

encoding target ...


In [35]:
from sklearn.base import BaseEstimator, TransformerMixin

In [41]:
class NumericalDrifter(BaseEstimator, TransformerMixin):

    def __init__(self, threshold=0.6):
        self.threshold = threshold
        self.dft = Drift_thresholder(threshold=self.threshold)

    def fit(self, df):
        return self

    def transform(self, df):
        return self.dft.fit_transform(df)
    

In [37]:
num_drifter = NumericalDrifter(threshold=0.7)

In [38]:
num_drifter.fit(df)

NumericalDrifter(threshold=None)

In [40]:
num_drifter.transform(df)


computing drifts ...
CPU time: 5.7858781814575195 seconds

> Top 10 drifts

('PetalWidthCm', 0.32449072666463974)
('PetalLengthCm', 0.08370705381574939)
('SepalLengthCm', 0.017216479173000954)
('SepalWidthCm', 0.0019002736394040376)

> Deleted variables : []
> Drift coefficients dumped into directory : save


{'train':      PetalLengthCm  PetalWidthCm  SepalLengthCm  SepalWidthCm
 0              5.2           2.3            6.7           3.0
 1              1.5           0.2            5.4           3.7
 2              5.7           2.1            6.7           3.3
 3              1.5           0.4            5.4           3.4
 4              5.3           1.9            6.4           2.7
 ..             ...           ...            ...           ...
 100            1.3           0.4            5.4           3.9
 101            1.4           0.1            4.8           3.0
 102            5.0           1.9            6.3           2.5
 103            5.0           1.7            6.7           3.0
 104            1.6           0.2            5.0           3.0
 
 [104 rows x 4 columns],
 'test':     PetalLengthCm  PetalWidthCm  SepalLengthCm  SepalWidthCm
 0             5.5           1.8            6.5           3.0
 1             5.8           1.6            7.2           3.0
 2            