In [None]:
from scipy.stats import johnsonsu, moment, norm, tmean, tvar, skew
import matplotlib.pyplot as plt
import numpy as np
from random import gauss
from math import floor, log10
from decimal import Decimal

def cacheDist(mean, var, skewness, johnson, cachedDists):
  """Remember a fitted parameter set under its (mean, var) bucket.

  mean and var are converted to float and formatted to two decimals; the
  resulting strings are the two nesting keys into cachedDists. A record
  {'skewness': skewness, 'johnson': johnson} is appended to the list found
  there. johnson is stored by reference, not copied.

  cachedDists[meanKey][varKey] must already resolve to a list (for example
  via a nested defaultdict); this function does not create missing levels.
  """
  meanKey = '{0:0.2f}'.format(float(mean))
  varKey = '{0:0.2f}'.format(float(var))
  record = {'skewness': skewness, 'johnson': johnson}
  bucket = cachedDists[meanKey][varKey]
  bucket.append(record)
def cachedDist(mean, var, skewness, cachedDists):
  """Look up a previously cached parameter set for similar statistics.

  mean and var are converted to float and formatted to two decimals to form
  the two nesting keys. Within that bucket, the first record whose stored
  skewness differs from skewness by less than 0.25 wins and its 'johnson'
  value is returned. Returns None when either key is absent or no record is
  close enough. Membership is tested with `in`, so no keys are created.
  """
  meanKey = '{0:0.2f}'.format(float(mean))
  varKey = '{0:0.2f}'.format(float(var))
  if meanKey not in cachedDists:
    return None
  byVariance = cachedDists[meanKey]
  if varKey not in byVariance:
    return None
  closeEnough = (candidate['johnson'] for candidate in byVariance[varKey]
                 if abs(skewness - candidate['skewness']) < 0.25)
  return next(closeEnough, None)

In [None]:
from datetime import datetime

def buildJohnsonDict(johnsonDict, cachedDists, start=0):
  """Fit, or reuse from cache, Johnson SU parameters for every report list.

  Reads the module-level sortedReportLists (defined outside this cell),
  which is iterated as {attribute: {listKey: xs}}. The nested mapping is
  flattened into one sequence and processed from index `start` onward.

  For each non-empty xs the trimmed mean, trimmed variance and skewness are
  computed and looked up with cachedDist. On a miss, johnsonsu.fit(xs) is
  run and the four parameters are stored (via cacheDist) as '%.2E'-formatted
  strings with any 'E+00' suffix removed.

  Args:
    johnsonDict: mapping filled in place as
      johnsonDict[attribute][listKey] = {'a', 'b', 'loc', 'scale', 'N'}.
      A missing attribute level is created on demand.
    cachedDists: cache structure accepted by cacheDist / cachedDist;
      updated in place.
    start: index into the flattened sequence at which to begin. Empty lists
      count toward this index even though they are skipped without being
      stored.

  Returns:
    None. Progress is printed before every 100th fit/lookup.
  """
  # Flatten so that `start` can address a single position.
  xLists = [(listKey, attribute, xs)
            for attribute, listDict in sortedReportLists.items()
            for listKey, xs in listDict.items()]
  print('Starting at %s just %s to go!' % (start, len(xLists) - start))

  cacheHits = 0   # hits since the last progress line
  totalHits = 0
  lookups = 0
  elapsed = datetime.now()
  for listKey, attribute, xs in xLists[start:]:
    if len(xs) == 0:
      print('Len 0 for %s %s' % (attribute, listKey))
      continue
    if lookups % 100 == 0:
      # Guard the very first report, when nothing has been looked up yet.
      hitPercent = 100 * totalHits // lookups if lookups else 0
      seconds = int(round((datetime.now() - elapsed).total_seconds()))
      print('%d cache hits in last 100. Hit cache %d out of %d times (%d%%) took %d seconds'
            % (cacheHits, totalHits, lookups, hitPercent, seconds))
      cacheHits = 0
      elapsed = datetime.now()
    lookups += 1
    mean, var, skewness = tmean(xs), tvar(xs), skew(xs)
    cached = cachedDist(mean, var, skewness, cachedDists)
    if cached:
      # Copy: the cached dict is shared by every list that hits it, so
      # writing 'N' into it directly would overwrite N for earlier lists.
      params = dict(cached)
      cacheHits += 1
      totalHits += 1
    else:
      a, b, loc, scale = [('%.2E' % Decimal(x)).replace('E+00', '') for x in johnsonsu.fit(xs)]
      fitted = {'a': a, 'b': b, 'loc': loc, 'scale': scale}
      cacheDist(mean, var, skewness, fitted, cachedDists)
      # Keep the cached entry free of any per-list 'N'.
      params = dict(fitted)
    params['N'] = len(xs)
    # setdefault also works when johnsonDict is a plain dict loaded from JSON.
    johnsonDict.setdefault(attribute, {})[listKey] = params

In [None]:
# Load previously saved parameters if the file is present and parses;
# otherwise start empty. Only I/O and JSON parse failures are treated as
# "no saved data" so that other errors are not silently hidden.
try:
  with open('../static/commonDistributions.json') as distFile:
    # Wrap the loaded mapping so new attributes get an inner dict on first
    # access, exactly like the empty fallback below.
    johnsonDict = defaultdict(dict, json.load(distFile))
except (IOError, ValueError):
  johnsonDict = defaultdict(dict)
# Two-level cache: first key -> second key -> list of cached records.
cachedDists = defaultdict(lambda: defaultdict(list))

In [None]:
### Only run if johnsonDict isn't already built. Takes forever, but you can pause / resume
# Resume point: total number of entries already stored across all attributes.
allBuilt = sum(len(dct) for dct in johnsonDict.values())
buildJohnsonDict(johnsonDict, cachedDists, start=allBuilt)

In [None]:
### Only run to update commonDistributions if it's been rebuilt
# Serialize before opening the file: opening with 'w' truncates it, so a
# failure inside json.dumps would otherwise wipe the existing saved results.
serialized = json.dumps(johnsonDict, indent=2)
with open('../static/commonDistributions.json', 'w') as distFile:
    distFile.write(serialized)

In [None]:
import time
def linkJohnsonParams(debug=False):
  """Write stored distribution parameters onto each report.

  Iterates allWith('reports', offerings) and, for every report, builds a
  list of (attribute, keyPath) pairs: 'responses/<a>' for each entry of
  offeringResponseQs, plus 'profs/[i]/responses/<a>' for each prof index and
  each entry of profResponseQs when the report has a 'profs' key. For each
  pair, johnsonDict[attribute][report['reportId']] is looked up and, when
  present, passed to write() at keyPath + '/percentiles/distParams'.

  All of these names (allWith, offerings, offeringResponseQs,
  profResponseQs, johnsonDict, write) come from outside this cell.

  Args:
    debug: when True, print every attribute/keyPath pair and pause one
      second after each, as the original tracing did. Off by default so a
      normal run is not slowed down by one second per attribute.
  """
  offeringAttributes = [(a, 'responses/' + a) for a in offeringResponseQs]

  def profAttributes(numProfs):
    # One (attribute, keyPath) pair per prof index and prof question.
    return [(a, 'profs/[' + str(i) + ']/responses/' + a)
            for i in range(numProfs) for a in profResponseQs]

  for offering, reports in allWith('reports', offerings):
    for report in reports:
      attributes = offeringAttributes
      if 'profs' in report:
        attributes = offeringAttributes + profAttributes(len(report['profs']))
      for attribute, keyPath in attributes:
        if debug:
          print('attribute %s keypath %s' % (attribute, keyPath))
          time.sleep(1)
        try:
          params = johnsonDict[attribute][report['reportId']]
        except KeyError:
          # No stored parameters for this attribute/report: nothing to link.
          continue
        write(keyPath + '/percentiles/distParams', obj=report, writeVal=params)

In [None]:
# Run the linking pass: copies entries from johnsonDict onto the matching reports (see linkJohnsonParams).
linkJohnsonParams()