Professor Sourav Chatterjee's xicor coefficient of correlation (<a href="https://win-vector.com/2021/12/26/how-to-read-sourav-chatterjees-basic-xicor-defenition/">our note</a>, <a href="https://doi.org/10.1080/01621459.2020.1758115">JASA</a>; original sources: <a href="https://CRAN.R-project.org/package=XICOR">R package</a>, <a href="https://arxiv.org/abs/1909.10140">Arxiv</a>, <a href="https://news.ycombinator.com/item?id=29687613">Hacker News</a>, and <a href="https://github.com/czbiohub/xicor">a Python package</a> (different author)).

In [1]:
import numpy as np
import pandas as pd
from data_algebra.data_ops import *
import yaml


In [2]:
def xicor_query(*, x_name: str = 'x', y_name: str = 'y'):
    """
    Construct a data algebra pipeline that scores y_name as a function of x_name with xicor.
    Ref: https://arxiv.org/abs/1909.10140

    The pipeline evaluates
        xicor(x, y) = 1 - n * sum(i = 0 .. n-2) |r(i+1) - r(i)| / (2 * sum(i = 0 .. n-1) l(i) * (n - l(i)))
    where rows are indexed in x order, r(i) is the rank of the i-th y value and l(i) is the
    reverse rank of the i-th y value. Tied y values all receive the largest rank of their group.

    The working columns (x_name + "_tie_breaker", y_name + "_str", 'l', 'n', 'r', 'rplus',
    'rdiff', 'lterm', 'num_sum', 'den_sum', 'xicor') must all be distinct from each other and
    from x_name / y_name; this is asserted.

    :param x_name: name for explanatory variable column.
    :param y_name: name for dependent variable column.
    :return: data algebra query computing xicor (result has the single column 'xicor').
    """
    assert isinstance(x_name, str)
    assert isinstance(y_name, str)
    tie_col = x_name + "_tie_breaker"
    y_group_col = y_name + "_str"
    # all columns the pipeline reads or creates; a collision would silently overwrite a column
    working_columns = [
        x_name, y_name, tie_col, y_group_col,
        'l', 'n', 'r',
        'rplus', 'rdiff', 'lterm', 'num_sum', 'den_sum',
        'xicor',
    ]
    assert len(set(working_columns)) == len(working_columns)

    # string image of y, taken before the numeric conversion; used below to group tied y values
    stage = TableDescription(column_names=[x_name, y_name]).extend(
        {y_group_col: f'{y_name}.as_str()'})
    # convert types, and add in tie breaking column
    stage = stage.extend({
        x_name: f'1.0 * {x_name}',
        y_name: f'1.0 * {y_name}',
        tie_col: '_uniform()',
    })
    # annotate in number of rows
    stage = stage.extend({'n': '(1).sum()'})
    # compute y ranks, that we will use to compare rank changes wrt x
    stage = stage.extend(
        {'r': '(1).cumsum()'},
        order_by=[y_name])
    # compute reverse y ranks, used to normalize for ties in denominator
    stage = stage.extend(
        {'l': '(1).cumsum()'},
        order_by=[y_name],
        reverse=[y_name])
    # go to max rank of group tie breaking
    stage = stage.extend(
        {
            'l': 'l.max()',
            'r': 'r.max()',
        },
        partition_by=[y_group_col])
    # get y rank and y rank of next x-item into same row so we can take a difference
    stage = stage.extend(
        {'rplus': 'r.shift()'},
        order_by=[x_name, tie_col],
        reverse=[x_name, tie_col])
    # compute numerator and denominator terms
    stage = stage.extend({
        'rdiff': '(rplus - r).abs().coalesce_0()',
        'lterm': 'l * (n - l)',
    })
    # aggregate to compute sums in xicor definition
    stage = stage.project({
        'num_sum': 'rdiff.sum()',
        'den_sum': 'lterm.sum()',
        'n': 'n.max()',  # pseudo-aggregation, column is constant
    })
    # actual xicor formula
    stage = stage.extend({'xicor': '1.0 - n * num_sum / ( 2.0 * den_sum)'})
    return stage.select_columns('xicor')


In [3]:
# Build the query once; xicor() below reuses it on every call.
x_y_ops = xicor_query(x_name='x', y_name='y')

def xicor(x, y):
    """
    Score how well y is described as a function of x, using the xicor statistic.

    The inputs are placed in a two column data frame ('x', 'y') and run through
    the pre-built x_y_ops query.

    :param x: vector of explanatory variable values.
    :param y: vector of dependent variable values.
    :return: xicor score (floating point number).
    """

    observations = pd.DataFrame({'x': x, 'y': y})
    scored = x_y_ops.transform(observations)
    xicor_column = scored['xicor']
    return xicor_column.values[0]


In [4]:
# y strictly increasing in x (no ties)
xicor([1, 2, 3], [1, 2, 3])

0.25

In [5]:
# y strictly decreasing in x (no ties)
xicor([1, 2, 3], [3, 2, 1])

0.25

In [6]:
# y neither increasing nor decreasing in x
xicor([1, 2, 3], [1, 3, 2])

-0.125

In [7]:
# Load the stored comparison cases; the loop below reads the keys 'a', 'b' and 'xicor' from each.
with open("examples.yaml", "r") as in_f:
    examples = yaml.safe_load(in_f)

In [8]:
# Check our xicor against the stored reference scores, one example at a time.
for example in examples:
    a, b = example['a'], example['b']
    ref_xicor = example['xicor']
    # scores can differ between calls (random tie breaker column), so draw one score
    # per reference value and compare the two samples by mean and standard deviation
    our_xicor = [xicor(a, b) for _ in ref_xicor]
    mean_gap = np.abs(np.mean(ref_xicor) - np.mean(our_xicor))
    assert mean_gap < 0.05
    std_gap = np.abs(np.std(ref_xicor) - np.std(our_xicor))
    assert std_gap < 0.05
    print(f'ref: {np.mean(ref_xicor)} {np.std(ref_xicor)}, ours: {np.mean(our_xicor)} {np.std(our_xicor)}')

ref: -0.0676692 0.0, ours: -0.06766917293233089 1.3877787807814457e-17
ref: -0.007518800000000003 3.469446951953614e-18, ours: -0.007518796992481258 0.0
ref: 2.2204459999999992e-16 7.395570986446986e-32, ours: 0.0 0.0
ref: -0.18796989999999994 5.551115123125783e-17, ours: -0.18796992481203012 0.0
ref: 0.135514 0.0, ours: 0.1355140186915888 5.551115123125783e-17
ref: 0.003533834180000004 0.06878326849618019, ours: 0.005639097744360889 0.06476102698102339
ref: -0.06844740000000002 1.3877787807814457e-17, ours: -0.06844741235392317 1.3877787807814457e-17
ref: -0.12718959999999996 5.551115123125783e-17, ours: -0.12718964204112715 0.0
ref: 0.04385151299999999 0.08295654197477093, ours: 0.05305491105955143 0.08999555737058786
ref: -0.12030080000000005 5.551115123125783e-17, ours: -0.12030075187969928 4.163336342344337e-17
ref: -0.042562927 0.11081480834838983, ours: -0.0505720823798627 0.11107165325749958
ref: -0.01849624 0.07992888705673187, ours: -0.015263157894736859 0.08054211003255178
r

In [9]:
# end-of-notebook marker: when cells are run in order, reaching this line means no check above raised
print('done')

done
