This notebook implements Professor Sourav Chatterjee's xicor coefficient of correlation as a data algebra query (<a href="https://win-vector.com/2021/12/26/how-to-read-sourav-chatterjees-basic-xicor-defenition/">our note</a>, <a href="https://doi.org/10.1080/01621459.2020.1758115">JASA</a>; original sources: <a href="https://CRAN.R-project.org/package=XICOR">R package</a>, <a href="https://arxiv.org/abs/1909.10140">arXiv</a>, <a href="https://news.ycombinator.com/item?id=29687613">Hacker News</a>, and <a href="https://github.com/czbiohub/xicor">a Python package</a> (different author)).

In [3]:
import pandas as pd
import numpy as np
from data_algebra.data_ops import *


def xicor_query(*, x_name: str = 'x', y_name: str = 'y'):
    """
    Build a query computing the xicor of y_name as a function of x_name.
    Ref: https://arxiv.org/abs/1909.10140

    xicor(x, y) : 1 - n sum(i = 0, n-2) |r(i+1) - r(i)| / (2 * sum(i=0, n-1) l(i) (n - l(i))),
    where r(i) is the rank of the i-th Y item when ordered by x, and l(i) is the reverse rank of
    the i-th Y item.

    Ties in x and in y are ordered by two extra '_uniform()' columns named
    x_name + "_tie_breaker" and y_name + "_tie_breaker" (presumably random draws, in which case
    scores on tied data can vary between runs -- confirm against the data_algebra documentation).

    The query writes working columns into the same table that holds the input columns, so the
    input column names (and the derived tie-breaker names) must not reuse the working names
    'n', 'r', 'l' or 'rplus'.

    :param x_name: name for explanatory variable column.
    :param y_name: name for dependent variable column.
    :return: data algebra query computing xicor (single result column named 'xicor').
    :raises ValueError: if an input or tie-breaker column name collides with one of the
        working columns 'n', 'r', 'l', 'rplus'.
    """
    assert isinstance(x_name, str)
    assert isinstance(y_name, str)
    x_tie_breaker = x_name + "_tie_breaker"
    y_tie_breaker = y_name + "_tie_breaker"
    assert(len({x_name, y_name, x_tie_breaker, y_tie_breaker}) == 4)
    # These working columns are written while the input columns are still read by later steps.
    # An input column with one of these names would be overwritten before it is used, which
    # would silently produce a wrong score, so refuse such names up front.
    # ('rdiff' and 'lterm' are written only after the last read of the inputs.)
    reserved_names = {'n', 'r', 'l', 'rplus'}
    clashes = sorted(
        reserved_names.intersection([x_name, y_name, x_tie_breaker, y_tie_breaker])
    )
    if clashes:
        raise ValueError(
            f"column name(s) {clashes} collide with working columns used by xicor_query"
        )
    ops = (
        TableDescription(column_names=[x_name, y_name])
            .extend(  # add in some tie breaking columns
                {
                    x_tie_breaker: '_uniform()',
                    y_tie_breaker: '_uniform()',
                })
            .extend(  # annotate in number of rows
                {
                    'n': '(1).sum()'
                })
            .extend({'r': '(1).cumsum()'},  # compute y ranks, that we will use to compare rank changes wrt x
                order_by=[y_name, y_tie_breaker])
            .extend(
                {'l': '(1).cumsum()'},  # compute reverse y ranks, used to normalize for ties in denominator
                order_by=[y_name, y_tie_breaker],
                reverse=[y_name, y_tie_breaker])
            .extend(  # get y rank and y rank of next x-item into same row so we can take a difference
                {'rplus': 'r.shift()'},
                order_by=[x_name, x_tie_breaker],
                reverse=[x_name, x_tie_breaker])
            .extend(  # compute numerator and denominator terms
                {
                    # coalesce_0() turns the missing difference on the row with no shifted
                    # neighbor into 0, so that row adds nothing to the numerator
                    'rdiff': '(rplus - r).abs().coalesce_0()',
                    'lterm': 'l * (n - l)',
                })
            .project(  # aggregate to compute sums in xicor definition
                {
                    'num_sum': 'rdiff.sum()',
                    'den_sum': 'lterm.sum()',
                    'n': 'n.max()',  # pseudo-aggregation, column is constant
                }
                )
            .extend({'xicor': '1 - n * num_sum / ( 2 * den_sum)'})  #  actual xicor formula
            .select_columns('xicor')
    )
    return ops


def xicor(x, y):
    """
    Score how well y is described as a function of x, using the xicor coefficient.

    Convenience wrapper: builds the query from xicor_query() for columns named 'x' and 'y',
    applies it to a two-column pandas data frame made from the arguments, and returns the
    single value in the resulting 'xicor' column.

    :param x: vector of explanatory variable values.
    :param y: vector of dependent variable values.
    :return: xicor score (floating point number).
    """
    query = xicor_query(x_name='x', y_name='y')
    frame = pd.DataFrame({'x': x, 'y': y})
    scored = query.transform(frame)
    xicor_column = scored['xicor']
    # the query aggregates to one row, so the score is the first (only) entry
    return xicor_column.values[0]


In [4]:
# y strictly increasing in x: the y ranks taken in x order are 1, 2, 3, so
# num_sum = 1 + 1 = 2, den_sum = 0 + 2 + 2 = 4 and xicor = 1 - 3 * 2 / (2 * 4) = 0.25
xicor([1, 2, 3], [1, 2, 3])

0.25

In [5]:
# y strictly decreasing in x: the y ranks taken in x order are 3, 2, 1; only absolute rank
# differences enter the numerator, so num_sum = 2 and xicor = 1 - 3 * 2 / (2 * 4) = 0.25
xicor([1, 2, 3], [3, 2, 1])

0.25

In [6]:
# non-monotone y: the y ranks taken in x order are 1, 3, 2, so num_sum = 2 + 1 = 3,
# den_sum = 4 and xicor = 1 - 3 * 3 / (2 * 4) = -0.125
xicor([1, 2, 3], [1, 3, 2])

-0.125