This notebook computes Professor Sourav Chatterjee's xicor coefficient of correlation as a data algebra query, evaluated both in Pandas and in Google BigQuery. Background reading: <a href="https://win-vector.com/2021/12/29/exploring-the-xi-correlation-coefficient/">Nina Zumel's tutorial</a> and the <a href="https://doi.org/10.1080/01621459.2020.1758115">JASA</a> paper. Original sources: the <a href="https://CRAN.R-project.org/package=XICOR">R package</a>, <a href="https://arxiv.org/abs/1909.10140">Arxiv</a>, <a href="https://news.ycombinator.com/item?id=29687613">Hacker News</a>, and <a href="https://github.com/czbiohub/xicor">a Python package</a> (different author).

In [1]:
from typing import Iterable
import numpy as np
import pandas
import pandas as pd
from data_algebra.data_ops import descr, TableDescription, ViewRepresentation
import data_algebra.BigQuery
import yaml


In [2]:
def xicor_query(
        data: ViewRepresentation,
        *,
        x_name: str = 'x',
        y_name: str = 'y',
        var_keys: Iterable[str] = tuple()):
    """
    Build a data algebra query that computes the xicor of y_name as a function of x_name,
    once for each group of rows identified by var_keys.
    Ref: https://arxiv.org/abs/1909.10140

    xicor(x, y) = 1 - n * sum(i=0, n-2) |r(i+1) - r(i)| / (2 * sum(i=0, n-1) l(i) * (n - l(i))),
    where rows are ordered by x, r(i) is the rank of the i-th y item, and l(i) is the
    reverse rank of the i-th y item. Tied y values all take the largest rank in their tie
    group; ties in x are ordered by an extra "_uniform()" tie breaking column.

    :param data: data algebra operator tree (for example a table description) supplying the rows.
    :param x_name: name for explanatory variable column.
    :param y_name: name for dependent variable column.
    :param var_keys: names of the columns identifying each group of rows (must not be a single string).
    :return: data algebra query computing xicor; its columns are var_keys plus "xicor".
    """
    assert isinstance(x_name, str)
    assert isinstance(y_name, str)
    assert not isinstance(var_keys, str)
    group_cols = list(var_keys)
    tie_col = f"{x_name}_tie_breaker"
    y_group_col = f"{y_name}_group"
    # every working column must be distinct from the inputs and from each other
    working_cols = [
        x_name, y_name, tie_col, y_group_col,
        'l', 'n', 'r',
        'rplus', 'rdiff', 'lterm', 'num_sum', 'den_sum',
        'xicor',
    ]
    all_cols = working_cols + group_cols
    assert len(all_cols) == len(set(all_cols))

    # string copy of y to group ties on: Google BigQuery won't group by float
    query = data.extend({y_group_col: f"{y_name}.as_str()"})
    # convert types, and add in tie breaking column
    query = query.extend({
        x_name: f"1.0 * {x_name}",
        y_name: f"1.0 * {y_name}",
        tie_col: "_uniform()",
    })
    # annotate in number of rows per group
    query = query.extend({"n": "(1).sum()"}, partition_by=group_cols)
    # y ranks, that we will use to compare rank changes wrt x
    query = query.extend(
        {"r": "(1).cumsum()"},
        order_by=[y_name],
        partition_by=group_cols,
    )
    # reverse y ranks, used to normalize for ties in the denominator
    query = query.extend(
        {"l": "(1).cumsum()"},
        order_by=[y_name],
        reverse=[y_name],
        partition_by=group_cols,
    )
    # tied y values all move to the max rank of their tie group
    query = query.extend(
        {"l": "l.max()", "r": "r.max()"},
        partition_by=[y_group_col] + group_cols,
    )
    # bring the y rank of the next x-item onto the same row so we can take a difference
    query = query.extend(
        {"rplus": "r.shift(1)"},
        order_by=[x_name, tie_col],
        reverse=[x_name, tie_col],
        partition_by=group_cols,
    )
    # per-row numerator and denominator terms
    query = query.extend({
        "rdiff": "((rplus - r).abs()).coalesce(0)",
        "lterm": "l * (n - l)",
    })
    # aggregate to the sums in the xicor definition
    query = query.project(
        {
            "num_sum": "rdiff.sum()",
            "den_sum": "lterm.sum()",
            "n": "n.max()",  # pseudo-aggregation: n is constant across the rows of a group
        },
        group_by=group_cols,
    )
    # apply the actual xicor formula, and keep only the keys and the score
    query = query.extend({"xicor": "1.0 - ((n * num_sum) / (2.0 * den_sum))"})
    return query.select_columns(group_cols + ["xicor"])

In [3]:
# Build the query once, against a table description with columns 'x' and 'y';
# xicor() below re-uses it for every pair of vectors it is given.
x_y_ops = xicor_query(TableDescription(table_name='df', column_names=['x', 'y']))

def xicor(x, y):
    """
    Score y as a function of x with the xicor coefficient.

    :param x: vector of explanatory variable values.
    :param y: vector of dependent variable values.
    :return: xicor score (floating point number).
    """

    xy_frame = pd.DataFrame({'x': x, 'y': y})
    scored = x_y_ops.transform(xy_frame)
    # no grouping keys are used, so the score is read from the first result row
    xicor_column = scored['xicor']
    return xicor_column.values[0]


In [4]:
# y increasing in x on three points: expect 0.25
x1 = xicor(x=[1, 2, 3], y=[1, 2, 3])
assert x1 == 0.25
x1

0.25

In [5]:
# y decreasing in x on three points: expect 0.25 again
x2 = xicor(x=[1, 2, 3], y=[3, 2, 1])
assert x2 == 0.25
x2

0.25

In [6]:
# y neither increasing nor decreasing in x: expect -0.125
x3 = xicor(x=[1, 2, 3], y=[1, 3, 2])
assert x3 == -0.125
x3

-0.125

In [7]:
# Load the reference examples; the cells below read the 'a', 'b', and 'xicor' entries of each one.
# The encoding is given explicitly so parsing does not depend on the platform's default encoding.
with open("examples.yaml", "r", encoding="utf-8") as in_f:
    examples = yaml.safe_load(in_f)

In [8]:
def example_to_frame(ei):
    """
    Encode an example into a data frame.

    :param ei: position of the example in `examples`.
    :return: data frame holding the example's 'a' values as column x, its 'b' values
             as column y, and the label v_<ei> on every row as column vname.
    """
    record = examples[ei]
    columns = {'x': record['a'], 'y': record['b'], 'vname': f'v_{ei}'}
    return pd.DataFrame(columns)

# one frame per example, stacked into a single frame with a fresh 0..n-1 row index
example_frames = pd.concat(
    [example_to_frame(ei) for ei in range(len(examples))],
    ignore_index=True,
)

example_frames

Unnamed: 0,x,y,vname
0,0.561595,0.084883,v_0
1,0.035700,0.635105,v_0
2,0.748017,0.133560,v_0
3,0.663603,0.577371,v_0
4,0.074035,0.214352,v_0
...,...,...,...
995,0.360310,5.000000,v_49
996,0.413419,2.000000,v_49
997,0.224839,4.000000,v_49
998,0.910356,10.000000,v_49


In [9]:
# repetition ids 0..99; cross joining against this frame repeats every example row 100 times
rep_frame = pd.DataFrame({'rep': range(100)})


In [10]:
# cross join rows to get experiment repetitions: every example row is paired with every rep id
repeated_examples = descr(d=example_frames).natural_join(
    b=descr(rep_frame=rep_frame),
    by=[],
    jointype='cross',
)
# one xicor score per (vname, rep) group
per_rep_xicor = xicor_query(repeated_examples, var_keys=['vname', 'rep'])
# summarize the repetitions of each example by their mean and standard deviation
grouped_calc = (
    per_rep_xicor
        .project(
            {
                'xicor_mean': 'xicor.mean()',
                'xicor_std': 'xicor.std()',
            },
            group_by=['vname'])
        .order_rows(['vname'])
)
# evaluate the query on the local data frames
xicor_results = grouped_calc.eval({'d': example_frames, 'rep_frame': rep_frame})

xicor_results


Unnamed: 0,vname,xicor_mean,xicor_std
0,v_0,-0.067669,0.0
1,v_1,-0.007519,0.0
2,v_10,-0.032418,0.102718
3,v_11,-0.010075,0.081488
4,v_12,-0.132339,0.119049
5,v_13,0.069973,0.0
6,v_14,-0.058797,0.080827
7,v_15,0.195052,0.091931
8,v_16,-0.015911,0.099254
9,v_17,-0.05315,0.105742


In [11]:
# compare results
def compare_res(xicor_results_to_check, *, tol=0.05):
    """
    Check computed xicor summaries against the reference values stored in `examples`.

    For each example index ei, the row of xicor_results_to_check whose vname is v_<ei>
    must have xicor_mean and xicor_std within tol of the mean and standard deviation
    (as computed by np.mean and np.std) of that example's reference 'xicor' values.
    One "ref: ..., ours: ..." line is printed for each example that passes.

    :param xicor_results_to_check: data frame with columns vname, xicor_mean, and xicor_std.
    :param tol: largest allowed absolute difference (exclusive) between reference and computed values.
    :return: None
    :raises AssertionError: if an example has no result row, or a value is out of tolerance.
    """
    for ei in range(len(examples)):
        # only the reference scores are needed here, not the example's raw 'a'/'b' data
        ref_xicor = examples[ei]['xicor']
        ref_mean = np.mean(ref_xicor)
        ref_std = np.std(ref_xicor)
        vname = f'v_{ei}'
        our_result = xicor_results_to_check.loc[xicor_results_to_check['vname'] == vname, :]
        # fail with a readable message instead of an IndexError when the row is absent
        assert our_result.shape[0] > 0, f'no result row for {vname}'
        our_xicor_mean = our_result['xicor_mean'].values[0]
        our_xicor_std = our_result['xicor_std'].values[0]
        assert np.abs(ref_mean - our_xicor_mean) < tol, (
            f'{vname}: xicor mean {our_xicor_mean} is not within {tol} of reference {ref_mean}')
        assert np.abs(ref_std - our_xicor_std) < tol, (
            f'{vname}: xicor std {our_xicor_std} is not within {tol} of reference {ref_std}')
        print(f'ref: {ref_mean} {ref_std}, ours: {our_xicor_mean} {our_xicor_std}')

# the results evaluated on the local data frames must agree with the reference values
compare_res(xicor_results_to_check=xicor_results)

ref: -0.0676692 0.0, ours: -0.06766917293233088 0.0
ref: -0.007518800000000003 3.469446951953614e-18, ours: -0.007518796992481258 0.0
ref: 2.2204459999999992e-16 7.395570986446986e-32, ours: 0.0 0.0
ref: -0.18796989999999994 5.551115123125783e-17, ours: -0.18796992481203012 0.0
ref: 0.135514 0.0, ours: 0.13551401869158874 0.0
ref: 0.003533834180000004 0.06878326849618019, ours: 0.0012030075187969846 0.07103049688516429
ref: -0.06844740000000002 1.3877787807814457e-17, ours: -0.06844741235392315 0.0
ref: -0.12718959999999996 5.551115123125783e-17, ours: -0.12718964204112715 0.0
ref: 0.04385151299999999 0.08295654197477093, ours: 0.03665893271461717 0.07875861035584722
ref: -0.12030080000000005 5.551115123125783e-17, ours: -0.12030075187969924 0.0
ref: -0.042562927 0.11081480834838983, ours: -0.03241800152555301 0.10271768019420667
ref: -0.01849624 0.07992888705673187, ours: -0.010075187969924819 0.08148759304024
ref: -0.12621664900000001 0.11045203602378319, ours: -0.13233908948194661 0

In [12]:
# try it in database
# example_handle() supplies the BigQuery handle used by the following cells.
# Both frames are uploaded under the table names the query was built against
# ('d' and 'rep_frame'); allow_overwrite=True presumably replaces tables left by an earlier run.
db_handle = data_algebra.BigQuery.example_handle()
db_handle.insert_table(example_frames, table_name='d', allow_overwrite=True)
db_handle.insert_table(rep_frame, table_name='rep_frame', allow_overwrite=True)

(TableDescription(table_name="rep_frame", column_names=["rep"]))

In [13]:
# run the same query in the database and read the result back as a data frame
db_res = db_handle.read_query(grouped_calc)
# this is the last use of the handle in the notebook, so release its connection
# (re-run the cell that creates db_handle before querying again)
db_handle.close()

In [14]:
# the results read back from the database must agree with the reference values too
compare_res(xicor_results_to_check=db_res)

ref: -0.0676692 0.0, ours: -0.06766917293233088 0.0
ref: -0.007518800000000003 3.469446951953614e-18, ours: -0.007518796992481258 0.0
ref: 2.2204459999999992e-16 7.395570986446986e-32, ours: 0.0 0.0
ref: -0.18796989999999994 5.551115123125783e-17, ours: -0.18796992481203012 0.0
ref: 0.135514 0.0, ours: 0.13551401869158874 0.0
ref: 0.003533834180000004 0.06878326849618019, ours: -0.012330827067669197 0.061291063734880497
ref: -0.06844740000000002 1.3877787807814457e-17, ours: -0.06844741235392315 0.0
ref: -0.12718959999999996 5.551115123125783e-17, ours: -0.12718964204112715 0.0
ref: 0.04385151299999999 0.08295654197477093, ours: 0.03774168600154681 0.07547782490743328
ref: -0.12030080000000005 5.551115123125783e-17, ours: -0.12030075187969924 0.0
ref: -0.042562927 0.11081480834838983, ours: -0.0630053394355454 0.10524392090330134
ref: -0.01849624 0.07992888705673187, ours: -0.015263157894736845 0.07920441505021177
ref: -0.12621664900000001 0.11045203602378319, ours: -0.1133437990580847

In [15]:
# show we made it to here, and did not assert earlier
print('done')

done
