# Calculate Percentiles

In [1]:
import json, jupyterImport
jupyterImport.enableJupyterImports()
from Utilities import find, genSamples, genSample, pluck, copyKeys, allWith, finalWrite, percentileOfValue, indexOfClosest, analyze
from merge_scrapes import comments, reports, offerings, groupReplace
from collections import defaultdict

We group Q reports by `size` and `department`.

Each Q report includes `size`, but no `department` information.

Current `offerings` from my.harvard include `department`. Grouping these reports is easy:

In [2]:
def groupOfferingReportsByDepartment(offerings, reports):
  """Index the reports attached to offerings by department.

  Every (offering, topReport) pair yielded by `allWith('topReport', ...)` is
  visited. The top report, followed by any entries under the offering's
  optional 'reports' key, is filed under each name in the offering's
  'departments' list.

  Args:
    offerings: collection of offerings, handed to `allWith`.
    reports: not used by this function; kept so the signature is unchanged.

  Returns:
    defaultdict mapping department -> {reportId: report}. Because the inner
    dict is keyed on 'reportId', a report reached more than once is stored
    only once per department.
  """
  byDepartment = defaultdict(dict)
  for offering, topReport in allWith('topReport', offerings):
    attached = [topReport]
    if 'reports' in offering:
      attached = attached + offering['reports']
    for department in offering['departments']:
      bucket = byDepartment[department]
      for report in attached:
        bucket[report['reportId']] = report
  return byDepartment

For old `reports`, we don't know `department`, but we do know `group`. Each offering has a `group` that maps to a primary department (the first entry in its `departments` list), so we use that mapping to group the old reports:

In [3]:
def groupOrphanReportsByDepartment(offerings, reports, reportsByDepartment):
  """File reports that carry a 'group' under a department, in place.

  A group -> department table is built from the offerings, using the first
  entry of each offering's 'departments' list. Each report yielded by
  `allWith('group', reports)` has its group passed through
  `groupReplace.sub` and looked up in that table; reports whose group has no
  (truthy) department are skipped.

  Args:
    offerings: iterable of offerings providing 'group' and 'departments'.
    reports: collection of reports, handed to `allWith`.
    reportsByDepartment: mapping of department -> {reportId: report};
      mutated in place.

  Returns:
    None.
  """
  # When several offerings share a group, the last one seen wins.
  groupToDepartment = {
    offering['group']: offering['departments'][0] for offering in offerings
  }
  for report, group in allWith('group', reports):
    department = groupToDepartment.get(groupReplace.sub(group))
    if not department:
      continue
    reportsByDepartment[department][report['reportId']] = report
def convertDictsToLists(reportDicts):
  """Turn each department's {reportId: report} dict into a list of reports.

  Args:
    reportDicts: mapping of department -> dict whose values are reports.

  Returns:
    A new plain dict mapping each department to a list of that department's
    report values. The input mapping is not modified.
  """
  # items()/list() rather than iteritems()/bare values(): the result is the
  # same on Python 2, while on Python 3 this avoids the AttributeError from
  # iteritems() and produces real lists instead of dict views.
  return {
    department: list(group.values())
    for department, group in reportDicts.items()
  }

In [4]:
# Build department -> {reportId: report} from the offerings first, then let
# groupOrphanReportsByDepartment add the group-only reports to that same
# mapping (it mutates its third argument), and finally replace each
# department's dict with just its report values.
reportDictsByDepartment = groupOfferingReportsByDepartment(offerings, reports)
groupOrphanReportsByDepartment(offerings, reports, reportDictsByDepartment)
reportsByDepartment = convertDictsToLists(reportDictsByDepartment)

Now we'll group reports by `size`.

In [5]:
def sizeClass(size):
  """Return the label of the enrollment-size bucket that `size` falls in.

  `size` goes through int() first, so numeric strings are accepted and
  floats are truncated. Anything of 5 or below, including zero and negative
  numbers, lands in '1-5'.
  """
  count = int(size)
  # (inclusive lower bound, label) pairs, largest bucket first.
  buckets = (
    (200, '200+'),
    (100, '100-199'),
    (40, '40-99'),
    (20, '20-39'),
    (10, '10-19'),
    (6, '6-9'),
  )
  for lowerBound, label in buckets:
    if count >= lowerBound:
      return label
  return '1-5'

def groupReportsBySize(reports):
  """Bucket reports by the size class of their 'size' field.

  Args:
    reports: iterable of reports, each providing a 'size' entry that
      `sizeClass` can convert.

  Returns:
    defaultdict mapping size-class label -> list of reports, in the order
    the reports were encountered.
  """
  grouped = defaultdict(list)
  for report in reports:
    label = sizeClass(report['size'])
    grouped[label].append(report)
  return grouped

In [6]:
# Size-class label -> list of reports, built from the full `reports` list.
reportsBySize = groupReportsBySize(reports)

Now we have reports grouped by `department` and `size`. We need to look up percentiles for each course attribute and each professor attribute. For each attribute, we pre-compute a sorted list of scores. Then we can calculate any percentile with a quick binary search.

In [7]:
# Question names scored per professor; their scores are read from
# 'profs/[]/responses/<name>/score'.
profQNames = ['lectures', 'instructor', 'enthusiasm', 'turnaround', 'feedback', 'discussion', 'accessible']
# Question names scored per course; their scores are read from
# 'responses/<name>/score'.
# NOTE(review): 'feedback' appears in both lists, and sortReportGroups keys
# its result by the bare question name, so the professor 'feedback' lists
# replace the course 'feedback' lists there -- confirm whether that is
# intended.
courseQNames = ['feedback', 'section', 'recommend', 'assignments', 'overall', 'workload', 'materials']

In [8]:
def sortedFor(keyPath, reportGroups):
  """Collect the scores found at `keyPath` and sort them, per report group.

  Args:
    keyPath: path understood by `find`. A path containing '[]' is treated as
      multi-valued: only reports yielded by `allWith('profs', ...)` are
      consulted and the result of `find` is iterated. Any other path
      contributes one value per report.
    reportGroups: iterable of (category, reports) pairs.

  Returns:
    dict mapping category -> ascending list of the truthy values found.
    Falsy values (e.g. a missing score, or zero) are left out.
  """
  sortedForKeyPath = {}
  multiValued = '[]' in keyPath
  for category, reportGroup in reportGroups:
    if multiValued:
      values = [
        value
        for report, _ in allWith('profs', reportGroup)
        for value in find(keyPath, obj=report, cast=float)
      ]
    else:
      # NOTE(review): unlike the branch above, no cast=float is passed here,
      # so these values are sorted as whatever type `find` returns --
      # confirm the stored course scores are already numeric.
      values = [find(keyPath, obj=report) for report in reportGroup]
    # Drop falsy entries *before* sorting: the resulting list is the same,
    # but missing values never take part in a comparison (None vs. number
    # raises TypeError on Python 3) and are not sorted only to be discarded.
    sortedForKeyPath[category] = sorted(value for value in values if value)
  return sortedForKeyPath

def sortReportGroups(reportGroups):
  """Build the sorted score lists for every course and professor question.

  Args:
    reportGroups: iterable of (category, reports) pairs, passed unchanged to
      `sortedFor` once per question.

  Returns:
    dict mapping question name -> the per-category dict from `sortedFor`.
  """
  keyPaths = [(name, 'responses/' + name + '/score') for name in courseQNames]
  keyPaths += [(name, 'profs/[]/responses/' + name + '/score') for name in profQNames]
  # NOTE(review): the result is keyed by bare question name and the
  # professor questions are processed last, so a name present in both
  # courseQNames and profQNames keeps only its professor lists -- confirm
  # whether that is intended.
  sortedScores = {}
  for name, keyPath in keyPaths:
    sortedScores[name] = sortedFor(keyPath, reportGroups)
  return sortedScores

In [9]:
# Every grouping a score is ranked against, as (category, reports) pairs:
# one per size class, one per department, plus ('all', reports).
# list() keeps the concatenation valid where dict.items() returns a view
# (Python 3); on Python 2 it merely copies the list items() already returns.
reportGroups = list(reportsBySize.items()) + list(reportsByDepartment.items()) + [('all', reports)]
sortedScoreLists = sortReportGroups(reportGroups)

Now that these lists are pre-sorted, we can use them to quickly calculate percentiles and to fit the distributions we'll display, instead of sending all of this data down to the client as we used to. First, the percentiles:

In [10]:
def addPercentiles(report, offering):
  """Compute percentiles for every score found in `report`.

  Covers each course question and, when the report has a 'profs' entry, each
  professor question for each professor. A truthy score is ranked with
  `percentileOfValue` against pre-sorted lists from the module-level
  `sortedScoreLists`: the report's size class (stored under 'size'), all
  reports (under 'all'), and each department of `offering` (under the
  department's name). The resulting dict is handed to
  `find(<question path>/percentiles, obj=report, writeVal=...)`, which
  presumably stores it on the report -- verify against `Utilities.find`.

  Args:
    report: the report to read scores from and annotate.
    offering: the offering supplying the 'departments' list.

  Returns:
    None.
  """
  targets = [(name, 'responses/' + name) for name in courseQNames]
  if 'profs' in report:
    for index in range(len(report['profs'])):
      for name in profQNames:
        targets.append((name, 'profs/[' + str(index) + ']/responses/' + name))
  for name, keyPath in targets:
    score = find(keyPath + '/score', obj=report, cast=float)
    if not score:
      # No usable score at this path: nothing to rank.
      continue
    # NOTE(review): lists are looked up by bare question name, so a name
    # shared by courseQNames and profQNames uses one set of lists for both
    # the course score and the professor scores -- confirm this is intended.
    sortedLists = sortedScoreLists[name]
    percentiles = {}
    percentiles['size'] = percentileOfValue(score, sortedLists[sizeClass(report['size'])])
    percentiles['all'] = percentileOfValue(score, sortedLists['all'])
    for dept in offering['departments']:
      percentiles[dept] = percentileOfValue(score, sortedLists[dept])
    find(keyPath + '/percentiles', obj=report, writeVal=percentiles)
      
def calculatePercentiles():
  """Run addPercentiles over every report reachable from `offerings`.

  Uses the module-level `offerings`: for each (offering, topReport) pair
  yielded by `allWith('topReport', offerings)`, the top report is processed
  first, then each entry under the offering's optional 'reports' key.

  Returns:
    None.
  """
  for offering, topReport in allWith('topReport', offerings):
    addPercentiles(topReport, offering)
    extraReports = offering['reports'] if 'reports' in offering else ()
    for report in extraReports:
      addPercentiles(report, offering)

In [11]:
# Annotate the reports reachable from the module-level `offerings`.
calculatePercentiles()
# A second name for the same `offerings` object (an alias, not a copy).
offeringsWithPercentiles = offerings