# Calculate Percentiles

In [1]:
import json, jupyterImport
jupyterImport.enableJupyterImports()
from Utilities import find, genSamples, genSample, pluck, copyKeys, allWith, finalWrite
from merge_scrapes import comments, reports, offerings, groupReplace

importing Jupyter notebook from Utilities.ipynb
importing Jupyter notebook from merge_scrapes.ipynb
importing Jupyter notebook from parse_q.ipynb
importing Jupyter notebook from q_map.ipynb
importing Jupyter notebook from Utilities.ipynb
importing Jupyter notebook from my_harvard_map.ipynb
importing Jupyter notebook from load_raw_data.ipynb



We'll have percentiles for each course's similar courses (department + size), `primaryDepartment` and all other `departments`, its size class, and across all courses. We'll need to compile these lists of numbers first, starting with the department ones.

We use both primary departments that map directly to a group (Economics -> ECON) and bigger ones like General Education. To do this we build up a dictionary that maps departments to reports within them.

First issue: Q reports don't know their department, just their group. We'll take care of that with `deptNamesByGroup`.

In [3]:
def deptNamesByGroup(offerings, names={}):
  for offering, dept in allWith('primaryDepartment', offerings):
    names[offering['group']] = dept
  return names

 Now we map every Q report to the primary department that corresponds to its `group`.

In [4]:
from collections import defaultdict
def reportsByDept(deptNamesByGroup, reports):
  reportsByDept = defaultdict(dict)
  groups = {}
  for report, group in allWith('group', reports):
    group = groupReplace.sub(report['group'])
    groups[group] = True
    if group in deptNamesByGroup:
      reportsByDept[deptNamesByGroup[group]][report['reportId']] = report
  return reportsByDept

Reports don't know which departments they're in. But offerings do:

In [5]:
def addOfferingReportDeptNames(offerings, reportsByDept):
  for offering, topReport in allWith('topReport', offerings):
    if 'reports' in offering:
      reports = [topReport] + offering['reports']
    else:
      reports = [topReport]
    for dept in offering['departments']:
      for report in [r for r in reports if r['reportId'] not in reportsByDept[dept]]:
        reportsByDept[dept][report['reportId']] = report

Now we sort each of these lists of reports by `size`:

In [6]:
from operator import itemgetter
def deptNamesListSortedBySize(reportsByDept):
  repListByDept = {}
  for dept, repDict in reportsByDept.iteritems():
    repListByDept[dept] = sorted(repDict.values(), key=itemgetter('size'))
  return repListByDept

In [7]:
reportsByDepartment = reportsByDept(deptNamesByGroup(offerings), reports)
addOfferingReportDeptNames(offerings, reportsByDepartment)
reportsByDepartment = deptNamesListSortedBySize(reportsByDepartment)

Now we'll build up percentile lists based on the number of enrollments:

In [10]:
def sizeClass(size):
  size = int(size)
  if size >= 200:
    return '200+'
  elif size >= 100:
      return '100-199'
  elif size >= 40:
      return '40-99'
  elif size >= 20:
      return '20-39'
  elif size >= 10:
      return '10-19'
  elif size > 5:
      return '6-9'
  else:
      return '1-5'

def repListBySize(reports):    
  bySize = defaultdict(list)
  for report in reports:
    bySize[sizeClass(report['size'])].append(report)
  return bySize

In [11]:
reportsBySize = repListBySize(reports)

The last list we'll need is the main percentile: "similar courses." If we have a report for COMPSCI 50, we'll find the 50 most relevant Q reports as determined below. The formula considers size, title, group, number of professors, and workload.

In [None]:
### Takes ~30 seconds
similarReportsByReportId = genSimilarReportsByReportId(offerings)

We'll need to find percentiles for each attribute in `responses`, as well as the standard attributes in `profs/[]/responses`. We do this with binary search, essentially. For each list of reports we've generated (by size, department, etc.), and for each attribute (overall, workload, etc.), we'll create a sorted list of those values to search through.

In [12]:
profResponseQs = ['lectures', 'instructor', 'enthusiasm', 'turnaround', 'feedback', 'discussion', 'accessible']
offeringResponseQs = ['feedback', 'section', 'recommend', 'assignments', 'overall', 'workload', 'materials']

In [36]:
def sortedFor(keyPath, reportLists):
  sortedForKeyPath = {}
  for category, reports in reportLists:
    if '[]' in keyPath:
      keyPathValues = []
      for report, _ in allWith('profs', reports):
        for value in find(keyPath, obj=report, cast=float):
          keyPathValues.append(value)
      sortedByKeyPathValue = sorted(keyPathValues)
    else:
      sortedByKeyPathValue = sorted([find(keyPath, obj=r, cast=float) for r in reports])
    sortedForKeyPath[category] = [n for n in sortedByKeyPathValue if n]
  return sortedForKeyPath

def preSortPercentiles(reportLists, printProgress=False):
  sortedReportLists = {}
  for attribute in offeringResponseQs:
    sortedReportLists[attribute] = sortedFor('responses/' + attribute + '/score', reportLists)
    if printProgress:
      print 'Done sorting for attribute', attribute
  for attribute in profResponseQs:
    sortedReportLists[attribute] = sortedFor('profs/[]/responses/' + attribute + '/score', reportLists)
    if printProgress:
      print 'Done sorting for attribute', attribute
  return sortedReportLists

In [42]:
### This takes ~30 seconds, change printProgress to 'True' to see progress
reportLists = []
for repList in [reportsBySize, reportsByDepartment]:
  for category, reports in repList.items():
    reportLists.append((category, reports))
reportLists.append(('all', reports))
sortedScoreLists = preSortPercentiles(reportLists, printProgress=True)

Done sorting for attribute feedback
Done sorting for attribute section
Done sorting for attribute recommend
Done sorting for attribute assignments
Done sorting for attribute overall
Done sorting for attribute workload
Done sorting for attribute materials
Done sorting for attribute lectures
Done sorting for attribute instructor
Done sorting for attribute enthusiasm
Done sorting for attribute turnaround
Done sorting for attribute feedback
Done sorting for attribute discussion
Done sorting for attribute accessible


Now that we've got these lists all pre-sorted, we can use them to quickly calculate percentiles, and to fit the distributions we'll use to display instead of sending down all of this data to the client like we used to. First are the percentiles:

In [47]:
def addPercentiles(report, offering):
  offeringAttributes = [(a, 'responses/' + a) for a in offeringResponseQs]
  profAttributes = lambda numProfs: [(a, 'profs/[' + str(i) + ']/responses/' + a) for i in range(0, numProfs) for a in profResponseQs]
  attributes = offeringAttributes + profAttributes(len(report['profs'])) if 'profs' in report else offeringAttributes
  for attribute, keyPath in attributes:
    score = find(keyPath + '/score', obj=report, cast=float)
    numLists = sortedScoreLists[attribute]
    if score:
      percentiles = {
        'size': percentileOfValue(score, numLists[sizeClass(report['size'])]),
        'all': percentileOfValue(score, numLists['all']),
#         'similar': percentileOfValue(score, numLists[report['reportId']])
      }
      for dept in [offering['primaryDepartment']] + offering['departments']:
        percentiles[dept] = percentileOfValue(score, numLists[dept])
      find(keyPath + '/percentiles', obj=report, writeVal=percentiles)
      
def calculatePercentiles():
  for offering, topReport in allWith('topReport', offerings):
    addPercentiles(topReport, offering)
    if 'reports' in offering:
      for report in offering['reports']:
        addPercentiles(report, offering)

In [48]:
calculatePercentiles()
offeringsWithPercentiles = offerings

In [49]:
### Run to inspect
similarSum = 0
similarCount = 0
for offering, _reports in genSamples(allWith('reports', offerings), 1000, includeIndices=False):
  for report, responses in allWith('responses', _reports):
    for response, percentiles in allWith('percentiles', responses.values()):
      if 'similar' in percentiles:
        similarSum += percentiles['similar']
        similarCount += 1
    print report['group'], report['number'], report['title'], 'id:', report['reportId']
    print json.dumps(responses, indent=2)
    break

WOMGEN 98F Tutorial - Junior Year id: 40347
{
  "workload": {
    "score": "8.50", 
    "percentiles": {
      "all": 88, 
      "Women, Gender and Sexuality": 91, 
      "size": 78
    }
  }, 
  "feedback": {
    "score": "2.00", 
    "percentiles": {
      "all": 0, 
      "Women, Gender and Sexuality": 0, 
      "size": 1
    }
  }, 
  "section": {
    "score": "2.00", 
    "percentiles": {
      "all": 0, 
      "Women, Gender and Sexuality": 1, 
      "size": 1
    }
  }, 
  "overall": {
    "score": "3.00", 
    "percentiles": {
      "all": 0, 
      "Women, Gender and Sexuality": 1, 
      "size": 2
    }
  }, 
  "assignments": {
    "score": "2.50", 
    "percentiles": {
      "all": 0, 
      "Women, Gender and Sexuality": 0, 
      "size": 1
    }
  }, 
  "materials": {
    "score": "3.50", 
    "percentiles": {
      "all": 0, 
      "Women, Gender and Sexuality": 3, 
      "size": 8
    }
  }, 
  "recommend": {
    "score": "3.00", 
    "percentiles": {
      "all": 0, 
  