This notebook demonstrates Professor Sourav Chatterjee's xicor coefficient of correlation (<a href="https://win-vector.com/2021/12/29/exploring-the-xi-correlation-coefficient/">Nina Zumel's tutorial</a>, <a href="https://doi.org/10.1080/01621459.2020.1758115">JASA</a>; original sources: <a href="https://CRAN.R-project.org/package=XICOR">R package</a>, <a href="https://arxiv.org/abs/1909.10140">arXiv</a>, <a href="https://news.ycombinator.com/item?id=29687613">Hacker News</a>, and <a href="https://github.com/czbiohub/xicor">a Python package</a> (different author)).

In [5]:
from typing import Iterable
import numpy as np
import pandas as pd
from data_algebra.data_ops import descr, TableDescription, ViewRepresentation
import data_algebra.BigQuery
import yaml


In [6]:
# Describe a table named 'df' with columns x and y, then build the xicor
# operations over it once so xicor() can reuse them on every call.
xy_table_description = TableDescription(table_name='df', column_names=['x', 'y'])
x_y_ops = xicor_query(xy_table_description)

def xicor(x, y):
    """
    Return the xicor score of y regarded as a function of x.

    The two vectors are placed in a data frame with columns 'x' and 'y',
    the module-level x_y_ops operations are applied to it, and the first
    value of the resulting 'xicor' column is returned.

    :param x: vector of explanatory variable values.
    :param y: vector of dependent variable values.
    :return: xicor score (floating point number).
    """

    xy_frame = pd.DataFrame({'x': x, 'y': y})
    scored = x_y_ops.transform(xy_frame)
    return scored['xicor'].values[0]


In [7]:
x1 = xicor([1, 2, 3], [1, 2, 3])  # y increases with x; expect 0.25
# Compare with a tolerance rather than ==, so the check does not hinge on
# bit-exact floating point results from the evaluation backend.
np.testing.assert_allclose(x1, 0.25)
x1

0.25

In [8]:
x2 = xicor([1, 2, 3], [3, 2, 1])  # y decreases with x; expect 0.25
# Compare with a tolerance rather than ==, so the check does not hinge on
# bit-exact floating point results from the evaluation backend.
np.testing.assert_allclose(x2, 0.25)
x2

0.25

In [9]:
x3 = xicor([1, 2, 3], [1, 3, 2])  # y is not monotone in x; expect -0.125
# Compare with a tolerance rather than ==, so the check does not hinge on
# bit-exact floating point results from the evaluation backend.
np.testing.assert_allclose(x3, -0.125)
x3

-0.125

In [10]:
# Load the reference examples. The encoding is given explicitly so the parse
# does not depend on the platform's default text encoding.
with open("examples.yaml", "r", encoding="utf-8") as in_f:
    examples = yaml.safe_load(in_f)

In [11]:
def example_to_frame(ei):
    """
    Encode example number ei as a data frame.

    :param ei: index into the module-level `examples` collection.
    :return: data frame with column 'x' (the example's 'a' values),
             column 'y' (its 'b' values), and a constant 'vname'
             column holding the label 'v_{ei}'.
    """
    record = examples[ei]
    return pd.DataFrame({
        'x': record['a'],
        'y': record['b'],
        'vname': f'v_{ei}',
    })

# Build one frame per example and stack them under a fresh 0..n-1 row index.
example_frames = pd.concat(
    [example_to_frame(ei) for ei in range(len(examples))],
    ignore_index=True,
)

example_frames

Unnamed: 0,x,y,vname
0,0.561595,0.084883,v_0
1,0.035700,0.635105,v_0
2,0.748017,0.133560,v_0
3,0.663603,0.577371,v_0
4,0.074035,0.214352,v_0
...,...,...,...
995,0.360310,5.000000,v_49
996,0.413419,2.000000,v_49
997,0.224839,4.000000,v_49
998,0.910356,10.000000,v_49


In [12]:
# One row per experiment repetition: rep = 0, 1, ..., 99.
rep_frame = pd.DataFrame(dict(rep=range(100)))


In [13]:
# Cross join the example rows with the repetition ids to get experiment
# repetitions: every example row is paired with every rep value.
replicated_examples = descr(d=example_frames).natural_join(
    b=descr(rep_frame=rep_frame),
    by=[],
    jointype='cross',
)
# xicor keyed by ['vname', 'rep']
# NOTE(review): presumably one score per (vname, rep) group; confirm in xicor_query.
per_rep_xicor = xicor_query(replicated_examples, var_keys=['vname', 'rep'])
# Summarize the scores of each vname by their mean and standard deviation,
# with rows ordered by vname.
summary_columns = {
    'xicor_mean': 'xicor.mean()',
    'xicor_std': 'xicor.std()',
}
grouped_calc = (
    per_rep_xicor
    .project(summary_columns, group_by=['vname'])
    .order_rows(['vname'])
)
# Evaluate the pipeline on the in-memory frames.
xicor_results = grouped_calc.eval({'d': example_frames, 'rep_frame': rep_frame})

xicor_results


Unnamed: 0,vname,xicor_mean,xicor_std
0,v_0,-0.067669,0.0
1,v_1,-0.007519,0.0
2,v_10,-0.056827,0.101521
3,v_11,-0.011729,0.088231
4,v_12,-0.120722,0.099834
5,v_13,0.069973,0.0
6,v_14,-0.04406,0.081703
7,v_15,0.193279,0.088892
8,v_16,-0.022305,0.090496
9,v_17,-0.061305,0.11825


In [14]:
# compare results
def compare_res(xicor_results_to_check, tol=0.05):
    """
    Check summarized xicor results against the reference values in `examples`.

    For each example index ei, the row whose vname equals 'v_{ei}' is looked up
    and its xicor_mean / xicor_std are compared with the mean / standard
    deviation of that example's reference 'xicor' values. One comparison line
    is printed for every example that passes.

    :param xicor_results_to_check: data frame with columns vname, xicor_mean, xicor_std.
    :param tol: differences must be strictly smaller than this absolute amount
                (default 0.05).
    :return: None
    :raises AssertionError: if a mean or standard deviation is off by tol or more.
    :raises IndexError: if the frame has no row for one of the examples.
    """
    # Index by position (rather than iterating) to match how the examples are
    # addressed everywhere else in this notebook.
    for ei in range(len(examples)):
        vname = f'v_{ei}'
        # Only the reference scores are needed here; the raw 'a'/'b' vectors are not.
        ref_xicor = examples[ei]['xicor']
        ref_mean = np.mean(ref_xicor)
        ref_std = np.std(ref_xicor)
        our_result = xicor_results_to_check.loc[xicor_results_to_check['vname'] == vname, :]
        if our_result.shape[0] < 1:
            # Same exception type as indexing an empty array, but says which example is missing.
            raise IndexError(f'no result row found for {vname}')
        our_xicor_mean = our_result['xicor_mean'].values[0]
        our_xicor_std = our_result['xicor_std'].values[0]
        assert np.abs(ref_mean - our_xicor_mean) < tol, (
            f'{vname}: xicor mean {our_xicor_mean} differs from reference {ref_mean} by {tol} or more'
        )
        assert np.abs(ref_std - our_xicor_std) < tol, (
            f'{vname}: xicor std {our_xicor_std} differs from reference {ref_std} by {tol} or more'
        )
        print(f'ref: {ref_mean} {ref_std}, ours: {our_xicor_mean} {our_xicor_std}')

# check the results of the in-memory evaluation (grouped_calc.eval) against the references
compare_res(xicor_results)

ref: -0.0676692 0.0, ours: -0.06766917293233088 0.0
ref: -0.007518800000000003 3.469446951953614e-18, ours: -0.007518796992481258 0.0
ref: 2.2204459999999992e-16 7.395570986446986e-32, ours: 0.0 0.0
ref: -0.18796989999999994 5.551115123125783e-17, ours: -0.18796992481203012 0.0
ref: 0.135514 0.0, ours: 0.13551401869158874 0.0
ref: 0.003533834180000004 0.06878326849618019, ours: -0.0009774436090225712 0.06992807851052675
ref: -0.06844740000000002 1.3877787807814457e-17, ours: -0.06844741235392315 0.0
ref: -0.12718959999999996 5.551115123125783e-17, ours: -0.12718964204112715 0.0
ref: 0.04385151299999999 0.08295654197477093, ours: 0.0279969064191802 0.08788468129913046
ref: -0.12030080000000005 5.551115123125783e-17, ours: -0.12030075187969924 0.0
ref: -0.042562927 0.11081480834838983, ours: -0.05682684973302821 0.10152062117597214
ref: -0.01849624 0.07992888705673187, ours: -0.011729323308270687 0.08823090724031592
ref: -0.12621664900000001 0.11045203602378319, ours: -0.1207221350078493

In [15]:
# try it in database
# get a database handle from data_algebra's BigQuery module
db_handle = data_algebra.BigQuery.example_handle()
# load both input frames under the table names 'd' and 'rep_frame', the same
# names grouped_calc's table descriptions were built with
# NOTE(review): allow_overwrite=True presumably replaces an existing table — confirm
db_handle.insert_table(example_frames, table_name='d', allow_overwrite=True)
# this is the cell's last expression, so its return value is what the cell displays
db_handle.insert_table(rep_frame, table_name='rep_frame', allow_overwrite=True)

(TableDescription(table_name="rep_frame", column_names=["rep"]))

In [16]:
# remove the "xicor" table before the next cell issues CREATE TABLE for that name
# NOTE(review): presumably clears a leftover from an earlier run — confirm how
# drop_table behaves when the table does not exist
db_handle.drop_table("xicor")

In [17]:
# Materialize the grouped xicor calculation as the "xicor" table, using the SQL
# the handle generates for grouped_calc.
create_xicor_sql = f"CREATE TABLE {db_handle.db_model.table_prefix}.xicor AS {db_handle.to_sql(grouped_calc)}"
db_handle.execute(create_xicor_sql)
# Read the table back ordered by vname.
select_xicor_sql = f"SELECT * FROM {db_handle.db_model.table_prefix}.xicor ORDER BY vname"
db_res = db_handle.read_query(select_xicor_sql)

In [18]:
# repeat the reference check on the results read back from the database
compare_res(db_res)

ref: -0.0676692 0.0, ours: -0.06766917293233088 0.0
ref: -0.007518800000000003 3.469446951953614e-18, ours: -0.007518796992481258 0.0
ref: 2.2204459999999992e-16 7.395570986446986e-32, ours: 0.0 0.0
ref: -0.18796989999999994 5.551115123125783e-17, ours: -0.18796992481203012 0.0
ref: 0.135514 0.0, ours: 0.13551401869158874 0.0
ref: 0.003533834180000004 0.06878326849618019, ours: -0.016240601503759423 0.06815869787738167
ref: -0.06844740000000002 1.3877787807814457e-17, ours: -0.06844741235392315 0.0
ref: -0.12718959999999996 5.551115123125783e-17, ours: -0.12718964204112715 0.0
ref: 0.04385151299999999 0.08295654197477093, ours: 0.032018561484918794 0.07778912646483754
ref: -0.12030080000000005 5.551115123125783e-17, ours: -0.12030075187969924 0.0
ref: -0.042562927 0.11081480834838983, ours: -0.04630053394355454 0.11283142924862233
ref: -0.01849624 0.07992888705673187, ours: 0.005037593984962396 0.08714819132102225
ref: -0.12621664900000001 0.11045203602378319, ours: -0.0998430141287284

In [19]:
# clean up: drop every table this notebook created, then release the handle
for table_name in ("d", "rep_frame", "xicor"):
    db_handle.drop_table(table_name)
db_handle.close()
# show we made it to here, and did not assert earlier
print('done')

done
