In [1]:
%%capture
%pip install -U 'rockfish[labs]' -f 'https://docs142.rockfish.ai/packages/index.html'

In [2]:
import pandas as pd
import random

import rockfish as rf
import rockfish.labs

### Marginal Distribution Score

#### Tabular dataset<a class="anchor" id="tab_marginal_dist_score"></a>

In [3]:
def generate_tabular_data(num_rows, rng):
    """Build a random four-column table for the tabular examples.

    Args:
        num_rows: Number of rows to generate.
        rng: Random source exposing ``uniform``, ``random`` and ``choices``
            (for example a ``random.Random`` instance).

    Returns:
        A ``pandas.DataFrame`` with two float columns (``numerical_col1`` from
        ``rng.uniform(1, 100)``, ``numerical_col2`` from ``rng.random() * 10``)
        and two categorical columns sampled with weights 1:1:2 over A/B/C and
        1:2:3 over X/Y/Z.
    """
    row_ids = range(num_rows)
    # The dict literal is evaluated top to bottom, so the draws taken from
    # ``rng`` happen column by column in a fixed sequence for a given seed.
    columns = {
        "numerical_col1": [rng.uniform(1, 100) for _ in row_ids],
        "numerical_col2": [rng.random() * 10 for _ in row_ids],
        "categorical_col1": rng.choices(
            ["A", "B", "C"], weights=[1, 1, 2], k=num_rows
        ),
        "categorical_col2": rng.choices(
            ["X", "Y", "Z"], weights=[1, 2, 3], k=num_rows
        ),
    }
    return pd.DataFrame(columns)


# Seed a dedicated generator so the samples are reproducible, then draw both
# tables from it one after the other. The second call continues the same random
# stream, so "sample2" follows the same distributions as "sample1" but holds
# different values. `data` is later passed as `dataset=` and `syn` as `syn=`.
rng = random.Random(42)
data = rf.Dataset.from_pandas("sample1", generate_tabular_data(100, rng))
syn = rf.Dataset.from_pandas("sample2", generate_tabular_data(100, rng))

In [4]:
# Inspect sample 1 (`data`) by converting it back to pandas for display.
data.to_pandas()

Unnamed: 0,numerical_col1,numerical_col2,categorical_col1,categorical_col2
0,64.303253,0.114810,C,Z
1,3.476065,7.207218,C,Z
2,28.227903,6.817104,C,Z
3,23.097863,5.369703,A,Z
4,73.910650,2.668252,B,X
...,...,...,...,...
95,38.780309,8.613491,C,Y
96,99.616017,5.503253,B,Y
97,53.382320,0.505883,A,Z
98,97.136759,9.992825,A,Z


In [5]:
# Inspect sample 2 (`syn`) the same way, for comparison with sample 1.
syn.to_pandas()

Unnamed: 0,numerical_col1,numerical_col2,categorical_col1,categorical_col2
0,44.773820,6.222570,A,Z
1,22.156439,0.269665,C,Z
2,47.845432,3.940203,B,Z
3,90.216902,5.643920,C,X
4,79.806451,0.271020,C,X
...,...,...,...,...
95,68.235047,8.803200,C,Z
96,41.027350,7.011838,C,Y
97,17.339428,2.762686,C,Z
98,47.271625,0.101511,B,Z


In [6]:
# Weighted-average marginal distribution score between the two samples.
# No `weights` argument is passed, so the library's default weights are used.
rf.labs.metrics.marginal_dist_score(dataset=data, syn=syn)

0.9299999999999999

In [7]:
# Recompute the score with custom per-column weights passed through `weights`:
# "numerical_col2" gets weight 0, "categorical_col1" 10 and "categorical_col2" 1.
# NOTE(review): "numerical_col1" is not listed; presumably it keeps its default
# weight. Confirm against the rockfish documentation.
new_weights_dict = {"numerical_col2": 0, "categorical_col1": 10, "categorical_col2": 1}
rf.labs.metrics.marginal_dist_score(dataset=data, syn=syn, weights=new_weights_dict)

0.9550000000000001

#### Time series dataset<a class="anchor" id="time_marginal_dist_score"></a>

In [8]:
def generate_timeseries_data(num_rows, rng):
    """Build a random daily time series table for the time series examples.

    Args:
        num_rows: Number of rows (one per day) to generate.
        rng: Random source exposing ``choices`` and ``uniform`` (for example
            a ``random.Random`` instance).

    Returns:
        A ``pandas.DataFrame`` with columns ``metadata_field`` (A/B/C, weights
        1:1:2), ``timestamp_field`` (consecutive days starting at "1/1/2020"),
        ``numerical_field`` (``rng.uniform(1, 100)``) and ``categorical_field``
        (a/b/c, weights 1:1:2).
    """
    # Entries are evaluated in listing order, which fixes the sequence of draws
    # taken from ``rng``; the timestamp column does not use ``rng`` at all.
    frame = pd.DataFrame(
        {
            "metadata_field": rng.choices(
                ["A", "B", "C"], weights=[1, 1, 2], k=num_rows
            ),
            "timestamp_field": pd.date_range(
                start="1/1/2020", periods=num_rows, freq="D"
            ),
            "numerical_field": [rng.uniform(1, 100) for _ in range(num_rows)],
            "categorical_field": rng.choices(
                ["a", "b", "c"], weights=[1, 1, 2], k=num_rows
            ),
        }
    )
    return frame

# Start from a fresh generator seeded with 42, so these samples do not depend on
# how much of the random stream earlier cells consumed. Both tables are drawn
# from the same generator in sequence: their random columns differ, while the
# timestamp column is identical because it is built without `rng`.
rng = random.Random(42)
ts_data = rf.Dataset.from_pandas("sample1", generate_timeseries_data(100, rng))
ts_syn = rf.Dataset.from_pandas("sample2", generate_timeseries_data(100, rng))

In [9]:
# Inspect time series sample 1 (`ts_data`) by converting it back to pandas.
ts_data.to_pandas()

Unnamed: 0,metadata_field,timestamp_field,numerical_field,categorical_field
0,C,2020-01-01,2.136621,c
1,A,2020-01-02,72.351460,c
2,B,2020-01-03,68.489327,c
3,A,2020-01-04,54.160063,a
4,C,2020-01-05,27.415694,b
...,...,...,...,...
95,B,2020-04-05,86.273561,c
96,C,2020-04-06,55.482206,b
97,C,2020-04-07,6.008245,a
98,C,2020-04-08,99.928964,a


In [10]:
# Inspect time series sample 2 (`ts_syn`) for comparison with sample 1.
ts_syn.to_pandas()

Unnamed: 0,metadata_field,timestamp_field,numerical_field,categorical_field
0,C,2020-01-01,44.773820,c
1,C,2020-01-02,22.156439,a
2,C,2020-01-03,47.845432,b
3,C,2020-01-04,90.216902,c
4,A,2020-01-05,79.806451,a
...,...,...,...,...
95,B,2020-04-05,68.235047,c
96,B,2020-04-06,41.027350,c
97,C,2020-04-07,17.339428,b
98,C,2020-04-08,47.271625,a


In [11]:
# Weighted-average marginal distribution score for the time series samples,
# using the library's default weights (no `weights` argument is passed).
# "metadata_field" is passed as `metadata` and "categorical_field" as
# `other_categorical`.
# NOTE(review): how rockfish treats these two roles is not visible here;
# confirm in the rockfish documentation.
rf.labs.metrics.marginal_dist_score(
    dataset=ts_data,
    syn=ts_syn,
    metadata=["metadata_field"],
    other_categorical=["categorical_field"],
)

0.8850859106529209

In [12]:
# Recompute the time series score with custom weights passed through `weights`
# (this rebinds `new_weights_dict` from the tabular example).
# NOTE(review): "interarrival" and "session_length" are not columns of the
# generated DataFrame; presumably they are measures rockfish derives for time
# series data. Confirm the accepted keys in the rockfish documentation.
new_weights_dict = {"interarrival": 0, "categorical_field": 10, "session_length": 2}
rf.labs.metrics.marginal_dist_score(
    dataset=ts_data,
    syn=ts_syn,
    metadata=["metadata_field"],
    other_categorical=["categorical_field"],
    weights=new_weights_dict,
)

0.9380952380952382

### Correlation Score

In [13]:
def generate_data(num_rows, rng):
    """Build a random table of three numerical columns for the correlation example.

    Args:
        num_rows: Number of rows to generate.
        rng: Random source exposing ``uniform`` and ``random`` (for example a
            ``random.Random`` instance).

    Returns:
        A ``pandas.DataFrame`` with float columns ``numerical_1``
        (``rng.uniform(1, 100)``), ``numerical_2`` (``rng.random() * 10``) and
        ``numerical_3`` (``rng.random()``).
    """
    # One zero-argument sampler per column. Columns are filled one after the
    # other, in this order, so the draw sequence from ``rng`` is fixed.
    samplers = {
        "numerical_1": lambda: rng.uniform(1, 100),
        "numerical_2": lambda: rng.random() * 10,
        "numerical_3": lambda: rng.random(),
    }
    return pd.DataFrame(
        {
            column: [draw() for _ in range(num_rows)]
            for column, draw in samplers.items()
        }
    )


# Fresh generator seeded with 42, then two 10,000-row numerical samples drawn
# from it in sequence (same distributions, different values).
# This rebinds `data` and `syn`, replacing the tabular samples used in the
# marginal distribution section.
rng = random.Random(42)
data = rf.Dataset.from_pandas("sample1", generate_data(10_000, rng))
syn = rf.Dataset.from_pandas("sample2", generate_data(10_000, rng))

In [14]:
# Correlation score between the two samples, restricted to the numerical
# columns named in `selected_fields` (here all three generated columns).
selected_fields = ["numerical_1", "numerical_2", "numerical_3"]
rf.labs.metrics.correlation_score(data, syn, selected_fields)

0.9770364935005559

### Association Score

In [15]:
# NOTE: this reuses the name `generate_data` from the correlation example, so
# running this cell replaces that earlier definition in the notebook namespace.
def generate_data(num_rows, rng):
    """Build a random table of three categorical columns for the association example.

    Args:
        num_rows: Number of rows to generate.
        rng: Random source exposing ``choices`` (for example a
            ``random.Random`` instance).

    Returns:
        A ``pandas.DataFrame`` with string columns ``categorical_1`` (A/B/C,
        weights 1:1:2), ``categorical_2`` (X/Y/Z, weights 1:2:3) and
        ``categorical_3`` ("1"/"2"/"3", weights 1:2:3).
    """
    # (column name, categories, sampling weights). Columns are sampled in this
    # order, which fixes the sequence of draws taken from ``rng``.
    column_specs = (
        ("categorical_1", ["A", "B", "C"], [1, 1, 2]),
        ("categorical_2", ["X", "Y", "Z"], [1, 2, 3]),
        ("categorical_3", ["1", "2", "3"], [1, 2, 3]),
    )
    return pd.DataFrame(
        {
            column: rng.choices(levels, weights=level_weights, k=num_rows)
            for column, levels, level_weights in column_specs
        }
    )


# Fresh generator seeded with 42, then two 10,000-row categorical samples drawn
# from it in sequence (same category weights, different values).
# This rebinds `data` and `syn`, replacing the numerical samples used in the
# correlation section.
rng = random.Random(42)
data = rf.Dataset.from_pandas("sample1", generate_data(10_000, rng))
syn = rf.Dataset.from_pandas("sample2", generate_data(10_000, rng))

In [16]:
# Association score between the two samples, restricted to the categorical
# columns named in `selected_fields` (here all three generated columns).
selected_fields = ["categorical_1", "categorical_2", "categorical_3"]
rf.labs.metrics.association_score(data, syn, selected_fields)

0.9874013933658707