# Merge Scrapes
Start by importing some utilities and the raw data:

In [5]:
import jupyterImport
jupyterImport.enableJupyterImports()
from Utilities import find, genSamples, genSample, pluck, copyKeys, MultiReplace, allWith
from parse_q import reports, comments
from my_harvard_map import offerings
from copy import deepcopy

First, we need to link `offerings` and `reports` together. The only key the two data sets share is the `group + number` combination, so that is what we match on. Group / department names that have changed over time have to be accounted for manually; the known replacements are listed here:

In [6]:
# Maps a group code as it appears in the Q reports (key) to the group code(s)
# it should be matched under (value). A value containing commas lists several
# candidate groups; buildQMatchTable splits the substituted string on ','.
# NOTE(review): several keys are prefixes of others ('LIT' / 'LITER' /
# 'LITERTRE', 'URD' / 'URDU'). Whether that is safe depends on how
# MultiReplace matches keys (whole string vs. substring) — confirm in Utilities.
groupReplace = MultiReplace({
  'AM-CIV': 'AMSTDIES',
  'CL': 'COMPLIT',
  'DRAMA': 'TDM',
  'HINDI': 'HIND-URD',
  'HUMFRAME': 'HUMAN',
  'HUMQSTNS': 'HUMAN',
  'IMUIL': 'IMMUN',
  'IRANCIV': 'IRANIAN',
  # One old group, two possible replacement groups.
  'LIT': 'COMPLIT,ENGLISH',
  'LITER': 'COMPLIT,ENGLISH',
  'LITERTRE': 'COMPLIT,ENGLISH',
  'MOR-REAS': 'ETHRSON',
  'PAL': 'PALI',
  'QNT-REAS': 'EMREAS',
  'SANSKRT': 'SANSKRIT',
  # One old group, two possible replacement groups.
  'SCIENCE': 'SCIPHUNV,SCILIVSY',
  'SOC-ANAL': 'SOCWORLD',
  'TAMIL': 'TAM',
  'TIBETAN': 'TIBET',
  'TIBHIST': 'TIBET',
  'URD': 'HIND-URD',
  'URDU': 'HIND-URD'
})

This mapping was worked out by inspecting the report groups with the following cell:

In [None]:
### Run to inspect
# Builds one summary line per report group, in sorted group order:
# "<group> <latest year> <number of reports> <title of the group's first report>".
from collections import defaultdict

# Bucket every report that has a 'group' under that group.
reportsByGroup = defaultdict(list)
for groupedReport, groupName in allWith('group', reports):
  reportsByGroup[groupName].append(groupedReport)

qRepsByGroup = []
for groupName in sorted(reportsByGroup):
  groupReports = reportsByGroup[groupName]
  # Years sorted newest first, so index 0 is the latest year seen for the group.
  yearsDescending = sorted([year for _, year in allWith('year', groupReports)], reverse=True)
  summaryFields = [groupName, yearsDescending[0], str(len(groupReports)), groupReports[0]['title']]
  qRepsByGroup.append(' '.join(summaryFields))
qRepsByGroup

Now we can match and assign `reports` to each `offering` that has them.

In [7]:
from collections import defaultdict
from operator import itemgetter

def buildQMatchTable(reports):
  """Index reports by every `group + number` key they can be matched under.

  For reports whose group is 'EXPOS', `report['number']` is rewritten in place
  to the part before the first '.'. Each report's group is passed through
  `groupReplace.sub`, and the result is split on ',' so that one report can be
  filed under several groups.

  Returns a defaultdict(list) mapping `group + number` to the list of reports
  (the same dict objects that were passed in) filed under that key.
  """
  qLookup = defaultdict(list)
  for report in reports:
    if report['group'] == 'EXPOS':
      # Keep only the part of the number before the first '.'.
      report['number'] = report['number'].split('.')[0]
    number = report['number']
    for candidateGroup in groupReplace.sub(report['group']).split(','):
      qLookup[candidateGroup + number].append(report)
  return qLookup
    
def topReport(offering, reports):
  """Pick the report most relevant to `offering` and separate it from the rest.

  Selection order:
    1. Walk the offering's profs in order; for each one, the first report (in
       the order given) that lists a prof with the same 'matchName' wins. It
       is tagged with 'relevantBecause': 'profMatch_index=<prof index>' and
       'matched': <matchName>.
    2. Otherwise (no 'profs' on the offering, or no report matches) the first
       report wins and is tagged 'relevantBecause': 'mostRecent'. This relies
       on the caller having ordered `reports` newest first.

  The tags are written to a shallow copy of the chosen report. The same report
  dict can be handed to this function for several offerings, so tagging it in
  place would let a later call overwrite (or leave stale keys on) the report
  that an earlier offering already uses as its top report.

  Args:
    offering: dict; may contain 'profs', a list of dicts with 'matchName'.
    reports: non-empty list of report dicts; each may contain 'profs'.

  Returns:
    (top, rest): the tagged copy of the chosen report, and a new list holding
    the remaining reports in their original order.

  Raises:
    IndexError: if `reports` is empty and no prof match is found.
  """
  def pick(index, tags):
    # Copy before tagging so the shared report dict is left untouched.
    top = dict(reports[index])
    top.update(tags)
    return top, reports[:index] + reports[index + 1:]

  for profIndex, prof in enumerate(offering.get('profs', [])):
    matchName = prof['matchName']
    for repI, report in enumerate(reports):
      if any(qProf['matchName'] == matchName for qProf in report.get('profs', [])):
        return pick(repI, {'relevantBecause': 'profMatch_index=' + str(profIndex), 'matched': matchName})
  return pick(0, {'relevantBecause': 'mostRecent'})

def linkReportsToOfferings(reports, offerings):
  """Attach matching reports to each offering, modifying the offerings in place.

  Reports are looked up by `offering['group'] + offering['number']` in the
  table built by buildQMatchTable. For every offering with matches:
    * 'EXPOS' offerings keep only reports that share at least one prof
      'matchName' with the offering; each kept report is a deep copy whose
      'profs' list is narrowed to the shared profs. An EXPOS offering with no
      such report is skipped.
    * every remaining report gets its 'group' overwritten with the offering's.
    * reports are ordered by 'year' descending, then 'term' ascending.
    * topReport chooses `offering['topReport']`; any leftover reports are
      stored in `offering['reports']` (the key is not set when none remain).

  Offerings without matching reports are left untouched. Returns None.
  """
  qLookup = buildQMatchTable(reports)
  for offering in offerings:
    matches = qLookup[offering['group'] + offering['number']]
    if not matches:
      continue

    if offering['group'] == 'EXPOS':
      offeringNames = set([p['matchName'] for p in offering.get('profs', [])])
      narrowed = []
      for report, profs in allWith('profs', matches):
        sharedProfs = [p for p in profs if p['matchName'] in offeringNames]
        if sharedProfs:
          # Copy so narrowing 'profs' does not affect the report used by other offerings.
          sectionReport = deepcopy(report)
          sectionReport['profs'] = sharedProfs
          narrowed.append(sectionReport)
      if not narrowed:
        continue
      matches = narrowed

    offeringGroup = offering['group']
    for report in matches:
      report['group'] = offeringGroup

    # Two stable sorts: the second keeps the term order within each year.
    matches.sort(key=lambda r: r['term'])
    matches.sort(key=lambda r: r['year'], reverse=True)

    offering['topReport'], leftover = topReport(offering, matches)
    if leftover:
      offering['reports'] = leftover

In [10]:
linkReportsToOfferings(reports, offerings)

In [18]:
offerings[181]

{'classNumber': u'18007',
 'consentRequired': u'No Consent',
 'courseId': u'128024',
 'courseSite': u'https://locator.tlt.harvard.edu/course/colgsas-128024/2017/fall/18007',
 'crossReg': [u'Available for Harvard Cross Registration'],
 'departments': [u'General Education'],
 'description': u'Provides an introduction to contemporary American constitutional law, with a principal focus on decisions by the Supreme Court of the US. Topics to be studied include freedom of speech and religion, guarantees of due process and equal protection, and the powers of Congress and the courts.',
 'enrollmentCap': u'999',
 'examDateTime': u'12/11/2017 2:00 PM',
 'format': u'Lecture',
 'genEds': [u'Ethical Reasoning'],
 'gradingBasis': u'FAS Letter Graded',
 'group': u'GOV',
 'level': u'For Undergraduate and Graduate Students',
 'notes': u'This course, when taken for a letter grade, meets the General Education requirement for Ethical Reasoning.',
 'number': u'1510',
 'numberAlt': u'GOV1510',
 'numberInt': 