In [1]:
%%capture
# Install/upgrade the Rockfish SDK with its `labs` extra, using the Rockfish
# package index as an extra find-links source. `%%capture` keeps pip's output
# out of the notebook.
%pip install -U 'rockfish[labs]' -f 'https://docs142.rockfish.ai/packages/index.html'

In [2]:
import pandas as pd
import numpy as np

import rockfish as rf
import rockfish.labs

In [3]:
# Create 2 random sample datasets with 2 numerical columns and 2 categorical columns
# Seed NumPy's global random state first so the random draws that follow are reproducible.
np.random.seed(42)
def generate_random_data(num_rows):
    """Build a DataFrame of random sample data with ``num_rows`` rows.

    Columns:
        numerical_1   -- ``np.random.rand`` draws multiplied by 100
        numerical_2   -- ``np.random.randn`` draws multiplied by 10
        categorical_1 -- random picks from 'A', 'B', 'C'
        categorical_2 -- random picks from 'X', 'Y', 'Z'

    All draws come from NumPy's global random state, so the result depends on
    how that state was seeded and on any draws made before this call.
    """
    # Dict entries are evaluated top to bottom, so the global random state is
    # consumed in the same column order on every call.
    return pd.DataFrame({
        'numerical_1': np.random.rand(num_rows) * 100,
        'numerical_2': np.random.randn(num_rows) * 10,
        'categorical_1': np.random.choice(['A', 'B', 'C'], num_rows),
        'categorical_2': np.random.choice(['X', 'Y', 'Z'], num_rows),
    })

# Wrap one generated frame per sample in a Rockfish Dataset. The generator is
# consumed in order, so the 100-row frame is drawn before the 150-row frame.
dataset1, dataset2 = (
    rf.Dataset.from_pandas(sample_name, generate_random_data(row_count))
    for sample_name, row_count in (("sample1", 100), ("sample2", 150))
)

In [4]:
# Preview the first rows of sample 1 as a pandas DataFrame
dataset1.to_pandas().head()

Unnamed: 0,numerical_1,numerical_2,categorical_1,categorical_2
0,37.454012,0.870471,B,Z
1,95.071431,-2.990074,A,Y
2,73.199394,0.917608,B,X
3,59.865848,-19.875689,A,Y
4,15.601864,-2.196719,C,Z


In [5]:
# Preview the first rows of sample 2 as a pandas DataFrame
dataset2.to_pandas().head()

Unnamed: 0,numerical_1,numerical_2,categorical_1,categorical_2
0,6.93613,2.954775,A,X
1,10.0778,-10.973459,C,Z
2,1.822183,3.409222,C,Y
3,9.444296,-0.864135,A,X
4,68.300677,2.756352,B,Z


#### Range coverage
Measures the range-coverage similarity of a selected numerical field between two datasets.

In [6]:
# Range coverage of the numerical field "numerical_1" between dataset1 and dataset2
rf.labs.metrics.range_coverage(dataset1, dataset2, "numerical_1")

0.9945835287232639

#### Category coverage
Measures the category-coverage similarity of a selected categorical field between two datasets.

In [7]:
# Category coverage of the categorical field "categorical_1" between dataset1 and dataset2
rf.labs.metrics.category_coverage(dataset1, dataset2, "categorical_1")

1.0

#### Total Variation Distance
Measures the extent of the difference between the probability distributions of a selected categorical field across two datasets.

In [8]:
# Total variation distance of the categorical field "categorical_1" between dataset1 and dataset2
# NOTE(review): the recorded result of 1.0 looks surprising next to the category
# coverage of 1.0 and the JSD of ~0.046 recorded for the same field -- confirm
# whether this metric is reported as a distance or as a similarity score.
rf.labs.metrics.tv_distance(dataset1, dataset2, "categorical_1")

1.0

#### Jensen-Shannon distance
Measures how far apart the probability distributions of the selected categorical field(s) are across two datasets; a value of 0 means the distributions are identical.

In [9]:
# Jensen-Shannon distance on one selected field (fields are passed as a list)
rf.labs.metrics.jsd(dataset1, dataset2, ["categorical_1"])

0.04591339884444373

In [10]:
# Jensen-Shannon distance on multiple selected fields, passed together in one list
rf.labs.metrics.jsd(dataset1, dataset2, ["categorical_1", "categorical_2"])

0.10046244340089014

#### Wasserstein distance
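Measures how far apart the probability distributions of a selected numerical field are across two datasets. It is also known as the earth mover's distance (hence `emd`); a value of 0 means the distributions are identical.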

In [11]:
# Wasserstein distance on the numerical field "numerical_1" (the field is passed in a list)
rf.labs.metrics.emd(dataset1, dataset2, ["numerical_1"])

6.263540147000409

#### Kolmogorov-Smirnov distance
Measures the maximum difference between the cumulative distribution functions of a selected numerical field across two datasets.

In [12]:
# Kolmogorov-Smirnov distance on the numerical field "numerical_1"
rf.labs.metrics.ks_distance(dataset1, dataset2, "numerical_1")

0.05666666666666664