Skip to content

Commit

Permalink
Merge pull request #29 from XENONnT/mybinder
Browse files Browse the repository at this point in the history
Example Notebook
  • Loading branch information
hammannr committed Dec 28, 2021
2 parents be93136 + f98c22b commit f1a3d87
Show file tree
Hide file tree
Showing 13 changed files with 732 additions and 57 deletions.
21 changes: 17 additions & 4 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,26 @@ jobs:
- name: Checkout repo
uses: actions/checkout@v2
- name: Install dependencies
uses: py-actions/py-dependency-install@v2
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest coverage coveralls
pip install -r requirements.txt
- name: Install GOFevaluation
run: |
pip install .
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
flake8 . --count --select=F5,F6,F7,F8,F9,E1,E2,E3,E5,E7,E9,W291 --max-line-length=100 --show-source --statistics
- name: Test with pytest
run: |
pytest
- name: Coveralls
# Make the coverage report and upload
env:
NUMBA_DISABLE_JIT: 1
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
coverage run --source=GOFevaluation setup.py test -v
coveralls --service=github
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ build
dist
__pycache__
.eggs
gofevaluation_tutorial.py
7 changes: 4 additions & 3 deletions GOFevaluation/evaluator_base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import numpy as np
import scipy.stats as sps
import warnings
from GOFevaluation import equiprobable_histogram, apply_irregular_binning, plot_equiprobable_histogram, check_sample_sanity
from GOFevaluation import (equiprobable_histogram, apply_irregular_binning,
plot_equiprobable_histogram, check_sample_sanity)


class EvaluatorBase(object):
Expand All @@ -19,9 +20,9 @@ def __repr__(self):

def __str__(self):
args = [self._name]
if self.gof:
if self.gof is not None:
args.append(f'gof = {self.gof}')
if self.pvalue:
if self.pvalue is not None:
args.append(f'p-value = {self.pvalue}')
args_str = "\n".join(args)
return f'{self.__class__.__module__}\n{args_str}'
Expand Down
4 changes: 2 additions & 2 deletions GOFevaluation/evaluators_nd.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class BinnedChi2GOF(EvaluatorBaseBinned):
initialise with .bin_equiprobable(...)
:param data_sample: sample of unbinned data
:type data_sample: array_like, n-Dimensional
:param reference_sample: sample of unbinned reference
:param reference_sample: sample of unbinned reference
(should have >50 samples than the data sample so that
statistical fluctuations are negligible.)
:type reference_sample: array_like, n-Dimensional
Expand Down Expand Up @@ -178,7 +178,7 @@ class PointToPointGOF(EvaluatorBaseSample):
:type data_sample: array_like, n-Dimensional
:param reference_sample: sample of unbinned reference
:type reference_sample: array_like, n-Dimensional
:param w_func: weighting function to use for the GOF measure.
:param w_func: weighting function to use for the GOF measure.
Defaults to 'log'. Other options are:
'x2', 'x', '1/x'
:type w_func: str, optional
Expand Down
17 changes: 10 additions & 7 deletions GOFevaluation/gof_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ def __init__(self, gof_list, **kwargs):
for gof_str in self.gof_list:
if gof_str in self.allowed_gof_str:
func = eval(gof_str)
self.gofs[gof_str] = None
self.pvalues[gof_str] = None
else:
allowed_str = ", ".join(
['"' + str(p) + '"' for p in self.allowed_gof_str])
Expand All @@ -71,13 +73,13 @@ def __repr__(self):
return f'{self.__class__.__module__}:\n{self.__dict__}'

def __str__(self):
args = ['GoF measures: ' + ", ".join(self.gof_list)]
if self.gofs:
gofs_str = ", ".join([str(g) for g in self.gofs.values()])
args.append('gofs = ' + gofs_str)
if self.pvalues:
pvalues_str = ", ".join([str(p) for p in self.pvalues.values()])
args.append('p-values = ' + pvalues_str)
args = ['GOF measures: ' + ", ".join(self.gof_list)] + ['\n']
for key, gof in self.gofs.items():
pval = self.pvalues[key]
results_str = '\033[1m' + key + '\033[0m' + '\n'
results_str += f'gof = {gof}\n'
results_str += f'p-value = {pval}\n'
args.append(results_str)
args_str = "\n".join(args)
return f'{self.__class__.__module__}\n{args_str}'

Expand Down Expand Up @@ -134,6 +136,7 @@ def get_pvalues(self, **kwargs):
specific_kwargs = self._get_specific_kwargs(func, kwargs)
pvalue = func(**specific_kwargs)
self.pvalues[key] = pvalue
self.gofs[key] = self.gof_objects[key].gof
kwargs_used += specific_kwargs.keys()
self._check_kwargs_used(kwargs_used, kwargs)
return self.pvalues
6 changes: 3 additions & 3 deletions GOFevaluation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,10 @@ def _get_equiprobable_binning(reference_sample, n_partitions, order=None):
[1, 0] : first bin y then bin x for each partition in y
if None, the natural order, i.e. [0, 1] is used. For 1D just put None.
:type order: list, optional
:return: Returns bin_edges.
:return: Returns bin_edges.
1D: list of bin edges
2D: For order [0, 1]([1, 0]) these are the bin edges in x(y) and y(x)
respectively. bin_edges[1] is a list of bin edges corresponding to the
2D: For order [0, 1]([1, 0]) these are the bin edges in x(y) and y(x)
respectively. bin_edges[1] is a list of bin edges corresponding to the
partitions defined in bin_edges[0].
:rtype: list of arrays
:raises ValueError: when an unknown order is passed.
Expand Down
6 changes: 5 additions & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
v0.1.0
===================
* Release as a python package
* Multiple GOF tests (binned and unbinned) can be performed (#1, #5, #10, #12, #13)
* The p-value is calculated based on toy sampling from the reference or a permutation test (#2, #14)
* A wrapper class makes it convenient to perform multiple GOF tests in parallel (#19)
* An equiprobable binning algorithm is implemented. The binning can be applied upon initialisation of the GOF object and a few visualization tools are provided. (#25, #26)
* CI workflow implemented (#7)
36 changes: 29 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# GOFevaluation
Evaluate the Goodness-of-Fit (GoF) for binned or unbinned data.
![Test package](https://github.com/XENONnT/GOFevaluation/actions/workflows/python-package.yml/badge.svg)
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/XENONnT/GOFevaluation/HEAD)
[![PyPI version shields.io](https://img.shields.io/pypi/v/GOFevaluation.svg)](https://pypi.python.org/pypi/GOFevaluation/)
[![CodeFactor](https://www.codefactor.io/repository/github/xenonnt/gofevaluation/badge)](https://www.codefactor.io/repository/github/xenonnt/gofevaluation)
[![Coverage Status](https://coveralls.io/repos/github/XENONnT/GOFevaluation/badge.svg?branch=master)](https://coveralls.io/github/XENONnT/GOFevaluation?branch=master)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5626909.svg)](https://doi.org/10.5281/zenodo.5626909)

This GoF suite makes it possible to calculate different 1D / nD, binned / two-sample (unbinned) GoF measures and the corresponding p-values. A list of implemented measures is given below.
Expand All @@ -19,6 +23,12 @@ This GoF suite comprises the possibility to calculate different 1D / nD, binned

## Installation and Set-Up

### Regular installation:
```
pip install GOFevaluation
```

### Developer setup:
Clone the repository:

```
Expand All @@ -37,6 +47,7 @@ python setup.py install --user
You are now good to go!

## Usage
The best way to start with the `GOFevaluation` package is to have a look at the tutorial notebook. If you click on the [mybinder](https://mybinder.org/v2/gh/XENONnT/GOFevaluation/HEAD) badge, you can execute the interactive notebook and give it a try yourself without the need for a local installation.
### Individual GoF Measures
Depending on your data and reference input you can initialise a `gof_object` in one of the following ways:
```python
Expand Down Expand Up @@ -84,27 +95,38 @@ gof_object.get_gofs(d_min=d_min)
# OUTPUT:
# OrderedDict([('ADTestTwoSampleGOF', 1.6301454042304904),
# ('KSTestTwoSampleGOF', 0.14),
# ('PointToPointGOF', 0.00048491049630050576)])
# ('PointToPointGOF', -0.7324060759792504)])

gof_object.get_pvalues(d_min=d_min)
# OUTPUT:
# OrderedDict([('ADTestTwoSampleGOF', 0.08699999999999997),
# ('KSTestTwoSampleGOF', 0.10699999999999998),
# ('PointToPointGOF', 0.14300000000000002)])
# ('PointToPointGOF', 0.31200000000000006)])

# Re-calculate p-value only for one measure:
gof_object.get_pvalues(d_min=.3, gof_list=['PointToPointGOF'])
gof_object.get_pvalues(d_min=.001, gof_list=['PointToPointGOF'])
# OUTPUT:
# OrderedDict([('ADTestTwoSampleGOF', 0.08699999999999997),
# ('KSTestTwoSampleGOF', 0.10699999999999998),
# ('PointToPointGOF', 0.03400000000000003)])
# ('PointToPointGOF', 0.128)])

print(gof_object)
# OUTPUT:
# GOFevaluation.gof_test
# GoF measures: ADTestTwoSampleGOF, KSTestTwoSampleGOF, PointToPointGOF
# gofs = 1.6301454042304904, 0.14, 0.00048491049630050576
# p-values = 0.08699999999999997, 0.10699999999999998, 0.03400000000000003
# GOF measures: ADTestTwoSampleGOF, KSTestTwoSampleGOF, PointToPointGOF


# ADTestTwoSampleGOF
# gof = 1.6301454042304904
# p-value = 0.08499999999999996

# KSTestTwoSampleGOF
# gof = 0.13999999999999996
# p-value = 0.09799999999999998

# PointToPointGOF
# gof = -0.7324060759792504
# p-value = 0.128
```


Expand Down
633 changes: 633 additions & 0 deletions gofevaluation_tutorial.ipynb

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,3 @@ numpy
scipy
sklearn
matplotlib
flake8
pytest
8 changes: 4 additions & 4 deletions tests/test_evaluators_1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def test_value(self):
kind='cubic')

# Calculate GoF 'by hand'
ecdf = np.arange(n_samples+1, dtype=float)/n_samples
ecdf = np.arange(n_samples + 1, dtype=float) / n_samples
dn = np.abs(interp_cdf(np.sort(data)) - ecdf[:-1])

# Calculate GoF
Expand Down Expand Up @@ -104,9 +104,9 @@ class TestPvalues(unittest.TestCase):
def test_two_sample_value(self):
"""Test if p-value for two identical samples is 1."""
# Fixed Standard Normal distributed data
data = np.array([-0.80719796, 0.39138662, 0.12886947, -0.4383365,
0.88404481, 0.98167819, 1.22302837, 0.1138414,
0.45974904, 0.48926863])
data = np.array([-0.80719796, 0.39138662, 0.12886947, -0.4383365,
0.88404481, 0.98167819, 1.22302837, 0.1138414,
0.45974904, 0.48926863])

gof_objects = [ADTestTwoSampleGOF(data, data),
KSTestTwoSampleGOF(data, data),
Expand Down
46 changes: 23 additions & 23 deletions tests/test_evaluators_nd.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@
class TestBinnedPoissonChi2GOF(unittest.TestCase):
def test_dimensions(self):
# test nD binned GOF in different dimensions:
for nD in range(2, 5+1):
for nD in range(2, 5 + 1):
# generate uniformly distributed data points and bin data
n_events_per_bin = 30
n_bins_per_dim = int(32**(1/nD))
n_bins_per_dim = int(32**(1 / nD))
n_events = int(n_bins_per_dim**nD * n_events_per_bin)

data_points = sps.uniform().rvs(size=[n_events, nD])
bin_edges = np.linspace(0, 1, n_bins_per_dim+1)
bin_edges = np.linspace(0, 1, n_bins_per_dim + 1)
bin_edges = np.array([bin_edges for i in range(nD)])
binned_data, _ = np.histogramdd(data_points, bins=bin_edges)
binned_data_flat = binned_data.reshape(-1)
Expand All @@ -42,14 +42,14 @@ def test_dimensions(self):

def test_from_binned(self):
"""Test if regular init and from_binned init give same result"""
for nD in range(1, 5+1):
for nD in range(1, 5 + 1):
# generate uniformly distributed data points and bin data
n_events_per_bin = 30
n_bins_per_dim = int(32**(1/nD))
n_bins_per_dim = int(32**(1 / nD))
n_events = int(n_bins_per_dim**nD * n_events_per_bin)

data_points = sps.uniform().rvs(size=[n_events, nD])
bin_edges = np.linspace(0, 1, n_bins_per_dim+1)
bin_edges = np.linspace(0, 1, n_bins_per_dim + 1)
bin_edges = np.array([bin_edges for i in range(nD)])
binned_data, _ = np.histogramdd(data_points, bins=bin_edges)

Expand Down Expand Up @@ -118,7 +118,7 @@ def test_distances(self):
data, reference)

self.assertEqual(len(d_data_data), nevents_data *
(nevents_data-1) / 2)
(nevents_data - 1) / 2)
self.assertEqual(len(d_data_ref), nevents_ref *
nevents_data)

Expand All @@ -141,7 +141,7 @@ def test_value(self):
xs_a = np.array([0])[:, None]
xs_b = np.array([1, 2])[:, None]

e_data_ref = np.log(2)/2
e_data_ref = np.log(2) / 2
gofclass_ab = PointToPointGOF(xs_a, xs_b)
# set d_min explicitly to avoid asymmetry in setting d_min
gof_ab = gofclass_ab.get_gof(d_min=0.01)
Expand All @@ -152,14 +152,14 @@ class TestBinnedChi2GOF(unittest.TestCase):

def test_dimensions(self):
# test nD binned GOF in different dimensions:
for nD in range(2, 5+1):
for nD in range(2, 5 + 1):
# generate uniformly distributed data points and bin data
n_events_per_bin = 30
n_bins_per_dim = int(32**(1/nD))
n_bins_per_dim = int(32**(1 / nD))
n_events = int(n_bins_per_dim**nD * n_events_per_bin)

data_points = sps.uniform().rvs(size=[n_events, nD])
bin_edges = np.linspace(0, 1, n_bins_per_dim+1)
bin_edges = np.linspace(0, 1, n_bins_per_dim + 1)
bin_edges = np.array([bin_edges for i in range(nD)])
binned_data, _ = np.histogramdd(data_points, bins=bin_edges)
binned_data_flat = binned_data.reshape(-1)
Expand Down Expand Up @@ -187,22 +187,22 @@ def test_chi2_distribution(self):

n_testvalues = 100
model = sps.uniform()
for nD in range(1, 5+1):
for nD in range(1, 5 + 1):
# have same number of events per bin and total number
# of bins for all tests
n_events_per_bin = 20
n_bins_per_dim = int(32**(1/nD))
n_bins_per_dim = int(32**(1 / nD))
n_events = int(n_bins_per_dim**nD * n_events_per_bin)

bin_edges = np.linspace(0, 1, n_bins_per_dim+1)
bin_edges = np.linspace(0, 1, n_bins_per_dim + 1)
bin_edges = np.array([bin_edges for i in range(nD)])

chi2_vals = []
for i in range(n_testvalues):
# generate uniformly distributed rvs with fixed random
# states for reproducibility
data_points = model.rvs(
size=[n_events, nD], random_state=300+i)
size=[n_events, nD], random_state=300 + i)
binned_data, _ = np.histogramdd(data_points, bins=bin_edges)

normed_pdf = np.ones(binned_data.shape)
Expand All @@ -228,19 +228,19 @@ def test_chi2_distribution(self):

# calculate 'reduced chi2' to estimate agreement of chi2 values
# and chi2 pdf
test_chi2 = np.sum((chi2_pdf-n)**2 / chi2_pdf)/n_chi2_bins
self.assertTrue((test_chi2 > 1/3) & (test_chi2 < 3))
test_chi2 = np.sum((chi2_pdf - n)**2 / chi2_pdf) / n_chi2_bins
self.assertTrue((test_chi2 > 1 / 3) & (test_chi2 < 3))

def test_from_binned(self):
"""Test if regular init and from_binned init give same result"""
for nD in range(1, 5+1):
for nD in range(1, 5 + 1):
# generate uniformly distributed data points and bin data
n_events_per_bin = 15
n_bins_per_dim = int(32**(1/nD))
n_bins_per_dim = int(32**(1 / nD))
n_events = int(n_bins_per_dim**nD * n_events_per_bin)

data_points = sps.uniform().rvs(size=[n_events, nD])
bin_edges = np.linspace(0, 1, n_bins_per_dim+1)
bin_edges = np.linspace(0, 1, n_bins_per_dim + 1)
bin_edges = np.array([bin_edges for i in range(nD)])
binned_data, _ = np.histogramdd(data_points, bins=bin_edges)

Expand Down Expand Up @@ -299,9 +299,9 @@ def test_dimension_two_sample(self):
d_min = .00001
for nD in [2, 3, 4]:
# Fixed Standard Normal distributed data
data = np.array([-0.80719796, 0.39138662, 0.12886947, -0.4383365,
0.88404481, 0.98167819, 1.22302837, 0.1138414,
0.45974904, 0.48926863])
data = np.array([-0.80719796, 0.39138662, 0.12886947, -0.4383365,
0.88404481, 0.98167819, 1.22302837, 0.1138414,
0.45974904, 0.48926863])
data = np.vstack([data for i in range(nD)]).T
gof_object = PointToPointGOF(data, data)

Expand Down
2 changes: 1 addition & 1 deletion tests/test_gof_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_gof(self):
model = sps.uniform
nevents_expected = 300
data_sample = model.rvs(size=nevents_expected)
reference_sample = model.rvs(size=nevents_expected*3)
reference_sample = model.rvs(size=nevents_expected * 3)
bin_edges = np.linspace(0, 1, 11)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
binned_data, _ = np.histogram(data_sample, bins=bin_edges)
Expand Down

0 comments on commit f1a3d87

Please sign in to comment.