# Improving tests for statistics

In [None]:
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RankNTypes #-}
import Control.Monad
import Control.Monad.Primitive

import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U

import Numeric.SpecFunctions
import Numeric.MathFunctions.Constants (m_epsilon)
import Numeric.MathFunctions.Comparison

import Statistics.Distribution
import Statistics.Distribution.Beta
import Statistics.Distribution.ChiSquared
import Statistics.Distribution.Gamma
import Statistics.Distribution.StudentT
import Statistics.Distribution.FDistribution

import IHaskell.Display
import Graphics.Rendering.Chart.Backend.Cairo
import Graphics.Rendering.Chart.Easy

:l NB/Plot

In [None]:
-- | Round-trip a probability through a distribution: compute the quantile
--   of @p@ and pass the result straight back into 'cumulative'.
--
--   With exact arithmetic the result would equal @p@; the notebook compares
--   the two to see how much precision the round trip loses.
roudtripQCDF :: ContDistr d => d -> Double -> Double
roudtripQCDF d = cumulative d . quantile d

-- | Estimate of the error expected from the @cumulative . quantile@
--   round trip at probability @p@.
--
--   The result is @m_epsilon + |(x / p) * density d x * m_epsilon|@ where
--   @x = quantile d p@.
--
--   NOTE(review): this reads as a baseline error of @m_epsilon@ plus the
--   first-order effect on @cumulative@ of a relative error of @m_epsilon@
--   in @x@ -- confirm the intended derivation.
--
--   There is no guard for @p = 0@; the division by @p@ is performed as is.
roundtripError :: ContDistr d => d -> Double -> Double
roundtripError d p =
  let q          = quantile d p
      -- Same left-to-right evaluation order as a plain product chain.
      propagated = (q / p) * density d q * m_epsilon
  in  m_epsilon + abs propagated

In [None]:
-- | Render a chart titled \"Error estimate\" with two curves as functions
--   of the probability @p@:
--
--   * log10 of the relative error between @p@ and its round trip through
--     'roudtripQCDF';
--   * log10 of the estimate given by 'roundtripError'.
--
--   The pair @(0,1)@ is handed to @plotFunctions@ (loaded from @NB/Plot@,
--   not visible here) -- presumably the plotted range of @p@; confirm there.
plotErrorEstimate d =
  toRenderable
    ( (layout_title .~ "Error estimate")
        (plotFunctions [observed, estimated] (0, 1))
    )
  where
    log10      = logBase 10
    -- Measured relative error of the round trip.
    observed p = log10 (relativeError p (roudtripQCDF d p))
    -- Predicted error of the round trip.
    estimated  = log10 . roundtripError d

-- | Same pair of curves as 'plotErrorEstimate' (log10 of the measured
--   round-trip relative error and log10 of 'roundtripError'), but drawn with
--   @plotFunctionsLog@ and the pair @(1e-10,1)@.
--
--   @plotFunctionsLog@ is loaded from @NB/Plot@ and is not visible here --
--   presumably it samples @p@ on a logarithmic scale over that range;
--   confirm there.
plotErrorEstimateLog d =
  toRenderable
    ( (layout_title .~ "Error estimate")
        (plotFunctionsLog [observed, estimated] (1e-10, 1))
    )
  where
    log10      = logBase 10
    -- Measured relative error of the round trip.
    observed p = log10 (relativeError p (roudtripQCDF d p))
    -- Predicted error of the round trip.
    estimated  = log10 . roundtripError d

The cells above define the `cumulative . quantile` round trip, an estimate of its expected error, and helpers for plotting both.

# Investigate beta distribution

In [None]:
-- Beta distribution under investigation. Exactly one definition of
-- `badBeta` should be active; the others are kept as alternatives to try.

-- Parameters from math-functions#35
badBeta = betaDistr 7 0.07

-- Parameters from math-functions#36
-- badBeta = betaDistr 4.5 4.5

-- Parameters that simply give poor precision
-- badBeta = betaDistr 0.1711087256012734 5.608055284855086

-- Density first, then the two round-trip error plots.
toRenderable (plotFunctions [density badBeta] (0, 1))
plotErrorEstimate    badBeta
plotErrorEstimateLog badBeta

# Chi squared

In [None]:
-- Chi-squared with 60 degrees of freedom: precision is bad overall
-- (even the shape of the error curve is wrong).
badChi = chiSquared 60

-- Density from 0 up to twice the mean, then the two round-trip error plots.
toRenderable (plotFunctions [density badChi] (0, 2 * mean badChi))
plotErrorEstimate    badChi
plotErrorEstimateLog badChi

# Gamma distribution

In [None]:
-- Gamma distribution whose parameters are being investigated.
badGamma = gammaDistr 9.671961397067255 9.41872452467439

-- Density from 0 up to twice the mean, then the two round-trip error plots.
toRenderable (plotFunctions [density badGamma] (0, 2 * mean badGamma))
plotErrorEstimate    badGamma
plotErrorEstimateLog badGamma

# StudentT

In [None]:
-- Student T with NDF=30. N.B. the loss of precision here is horrible!
badStudentT = studentT 30

-- Density on (-10,10), then the two round-trip error plots.
toRenderable (plotFunctions [density badStudentT] (-10, 10))
plotErrorEstimate    badStudentT
plotErrorEstimateLog badStudentT

Let's ignore the horrible peak near 0. Maybe it will go away once the issue with the bad initial guess for the inverse incomplete beta is resolved.

So why does Student T perform so poorly? Notice the peak near `p=0.5`. Let's look at the code. `cumulative` doesn't look too bad.

```.haskell
cumulative :: StudentT -> Double -> Double
cumulative (StudentT ndf) x
  | x > 0     = 1 - 0.5 * ibeta
  | otherwise = 0.5 * ibeta
  where
    ibeta = incompleteBeta (0.5 * ndf) 0.5 (ndf / (ndf + x*x))
```

But quantile... Just look at all these `1-p`s!

```
quantile :: StudentT -> Double -> Double
quantile (StudentT ndf) p
  | p >= 0 && p <= 1 =
    let x = invIncompleteBeta (0.5 * ndf) 0.5 (2 * min p (1 - p))
    in case sqrt $ ndf * (1 - x) / x of
         r | p < 0.5   -> -r
           | otherwise -> r
  | otherwise = modErr "quantile" $ "p must be in [0,1] range. Got: "++show p
```

Here we try to exploit the fact that `quantile d 0.5 = 0`, and lose about 2 significant digits in the process.