# Merge Scrapes
Start by importing some utilities and the raw data:

In [1]:
import jupyterImport
jupyterImport.enableJupyterImports()
from Utilities import find, genSamples, genSample, pluck, copyKeys, MultiReplace, allWith, analyze
from parse_q import reports, comments
from my_harvard_map import offerings
from copy import deepcopy

First, we need to link `offerings` and `reports` together. All we can really do is match on `group + number` combinations. Group / department names have changed over time, so we need to account for those renames manually. The renames we know about are listed here:

In [2]:
# Rewrites group codes found in `reports` to the codes used by `offerings`.
# A comma-separated value lists several target groups: buildQMatchTable splits
# the substituted string on ',' and indexes the report under each of them.
# NOTE(review): MultiReplace comes from Utilities and its matching rules are not
# visible here. Several keys are prefixes of others (LIT / LITER / LITERTRE,
# URD / URDU), so confirm how overlapping keys are resolved before reordering
# or adding entries.
groupReplace = MultiReplace({
  'AM-CIV': 'AMSTDIES',
  'CL': 'COMPLIT',
  'DRAMA': 'TDM',
  'HINDI': 'HIND-URD',
  'HUMFRAME': 'HUMAN',
  'HUMQSTNS': 'HUMAN',
  'IMUIL': 'IMMUN',
  'IRANCIV': 'IRANIAN',
  'LIT': 'COMPLIT,ENGLISH',
  'LITER': 'COMPLIT,ENGLISH',
  'LITERTRE': 'COMPLIT,ENGLISH',
  'MOR-REAS': 'ETHRSON',
  'PAL': 'PALI',
  'QNT-REAS': 'EMREAS',
  'SANSKRT': 'SANSKRIT',
  'SCIENCE': 'SCIPHUNV,SCILIVSY',
  'SOC-ANAL': 'SOCWORLD',
  'TAMIL': 'TAM',
  'TIBETAN': 'TIBET',
  'TIBHIST': 'TIBET',
  'URD': 'HIND-URD',
  'URDU': 'HIND-URD'
})

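That list was worked out by inspecting the output of the following cell, which summarizes each report group by its most recent year, its number of reports, and the title of one of its reports: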

In [3]:
### Run to inspect
from collections import defaultdict

# Bucket the reports by their 'group' value.
reportsByGroup = defaultdict(list)
for report, group in allWith('group', reports):
  reportsByGroup[group].append(report)

# One line per group, in sorted group order:
# "<group> <latest year> <number of reports> <title of the group's first report>".
qRepsByGroup = []
for groupName, groupReports in sorted(reportsByGroup.items()):
  yearsDescending = sorted((year for _, year in allWith('year', groupReports)), reverse=True)
  summaryFields = [groupName, yearsDescending[0], str(len(groupReports)), groupReports[0]['title']]
  qRepsByGroup.append(' '.join(summaryFields))
qRepsByGroup

[u'AESTHINT 2016 135 Modern Jewish Literature',
 u'AFRAMER 2016 489 Malagasy',
 u'AKKAD 2016 9 Intermediate Babylonian',
 u'AM-CIV 2012 6 Major Works in the History of American Civilization',
 u'AMSTDIES 2016 5 Hyphen-Nation',
 u'ANE 2016 26 The History and Archaeology of Jerusalem',
 u'ANTHRO 2016 519 Junior Tutorial in Social Anthropology',
 u'APCOMP 2016 20 Computational Fluid Dynamics',
 u'APMTH 2016 189 Mathematical Methods in the Sciences',
 u'APPHY 2016 103 Introduction to Solid State Physics',
 u'ARABIC 2016 154 Elementary Arabic',
 u'ARAMAIC 2012 3 Introduction to Babylonian Aramaic',
 u'ARMEN 2016 1 Elementary Modern Eastern Armenian',
 u'ARMENST 2016 1 Armenian Literature in Translation',
 u'ASTRON 2016 182 Radiative Processes in Astrophysics',
 u'BBS 2016 16 Analysis of the Biological Literature',
 u'BCMP 2016 81 Principles of Human Disease',
 u'BCS 2016 16 Elementary Bosnian/Croatian/Serbian',
 u'BE 2016 19 Cellular Engineering',
 u'BIOLOGY 2007 5 Behavioral Ecology',
 u'B

Now we can match and assign `reports` to each `offering` that has them.

In [4]:
from collections import defaultdict
from operator import itemgetter

def buildQMatchTable(reports):
  """Index reports by the string `group + number`.

  Each report's group is first passed through `groupReplace.sub`; the result is
  split on ',' and the report is filed once under every resulting group.

  Side effect: for reports whose group is 'EXPOS', `report['number']` is
  rewritten in place to the part before the first '.'.

  Returns a `defaultdict(list)` mapping each key to the reports filed under it.
  """
  table = defaultdict(list)
  for rep in reports:
    if rep['group'] == 'EXPOS':
      # Keep only what precedes the first '.' in an EXPOS number.
      rep['number'] = rep['number'].split('.')[0]
    courseNumber = rep['number']
    for targetGroup in groupReplace.sub(rep['group']).split(','):
      table[targetGroup + courseNumber].append(rep)
  return table
    
def topReport(offering, reports):
  """Pick the report most relevant to `offering` and return it with the rest.

  The offering's professors are tried in listed order. For each one, `reports`
  is scanned front to back, and the first report listing a professor with the
  same `matchName` is chosen. If the offering has no 'profs', or nobody
  matches, the first report is chosen instead.

  The chosen report is returned as a shallow copy carrying:
    - 'relevantBecause': 'profMatch_index=<i>' (i = position of the matching
      professor in `offering['profs']`) or 'mostRecent' for the fallback;
    - 'matched': the matching `matchName` (professor matches only).

  The annotations go on a copy because the same report dicts can be passed in
  for several offerings; writing on the shared dict would relabel a report that
  an earlier offering already holds and could leave a stale 'matched' behind.

  Args:
    offering: dict; 'profs', when present, is a list of dicts with 'matchName'.
    reports: non-empty list of report dicts, in preference order for the
      fallback (index 0 is labeled 'mostRecent').

  Returns:
    (top, rest): the annotated copy and a new list of the other reports, in
    their original order.

  Raises:
    IndexError: if `reports` is empty.
  """
  def annotated(index, **notes):
    # Copy, then label, so the caller's report dicts are left untouched.
    top = dict(reports[index])
    top.update(notes)
    return top, reports[:index] + reports[index + 1:]

  matchNames = [prof['matchName'] for prof in offering.get('profs', [])]
  for profIndex, matchName in enumerate(matchNames):
    for repI, report in enumerate(reports):
      if any(qProf['matchName'] == matchName for qProf in report.get('profs', [])):
        return annotated(repI,
                         relevantBecause='profMatch_index=' + str(profIndex),
                         matched=matchName)
  return annotated(0, relevantBecause='mostRecent')

def linkReportsToOfferings(reports, offerings):
  """Attach to every offering the reports filed under the same group + number.

  Mutates each matched offering in place: `offering['topReport']` receives the
  report chosen by `topReport`, and `offering['reports']` receives the remaining
  ones when there are any. Offerings with no matching report are left untouched.

  Before the top report is chosen, the reports are ordered newest year first
  and, within a year, by ascending term.

  EXPOS offerings only keep reports that share at least one professor
  (`matchName`) with the offering, and each kept report lists only those shared
  professors.

  Every offering works on its own copies of the report dicts. The lookup table
  can hand the same report dict to several offerings (a report whose group maps
  to several current groups is indexed under each of them), so writing the
  offering's group onto the shared dict would let the last offering processed
  overwrite the value seen by all the others.
  """
  qLookup = buildQMatchTable(reports)
  for offering in offerings:
    # .get avoids inserting an empty list into the defaultdict on every miss.
    candidates = qLookup.get(offering['group'] + offering['number'])
    if not candidates:
      continue
    if offering['group'] == 'EXPOS':
      names = set(p['matchName'] for p in offering.get('profs', []))
      linked = []
      for report, profs in allWith('profs', candidates):
        sharedProfs = [p for p in profs if p['matchName'] in names]
        if sharedProfs:
          newReport = deepcopy(report)
          newReport['profs'] = sharedProfs
          linked.append(newReport)
      if not linked:
        continue
    else:
      # Shallow copies suffice: only top-level keys are written below.
      linked = [dict(report) for report in candidates]
    for report in linked:
      report['group'] = offering['group']
    # Two stable sorts: term ascending first, then year descending on top.
    linked.sort(key=itemgetter('term'))
    linked.sort(key=itemgetter('year'), reverse=True)
    top, rest = topReport(offering, linked)
    offering['topReport'] = top
    if len(rest) > 0:
      offering['reports'] = rest

In [5]:
# Mutates `offerings` in place: each offering with matching reports gains
# 'topReport', plus 'reports' holding any remaining matches.
linkReportsToOfferings(reports, offerings)

In [6]:
import operator

def getPrimaryReason(offering):
  """Return the display label of the highest-valued reason on the top report.

  Reads `offering['topReport']['reasons']`, a mapping of reason key to a
  sortable value, and picks the key with the largest value. On ties, the key
  that sorts last under a stable sort by value wins.

  Returns:
    {'primaryReason': <label>} on success, or {} when the offering has no top
    report, the report has no reasons, the reasons are empty, or they are not a
    sortable mapping. The dict is shaped for `offering.update(...)`.

  Raises:
    KeyError: if the winning reason key has no entry in the display map.
  """
  displayMap = {
    'elective': 'Elective',
    'concentration': 'Concentration',
    'secondary': 'Secondary or Citation',
    'genEd': 'Gen Ed',
    'expos': 'Expos',
    'language': 'Language',
    'preMed': 'Pre-Med'
  }
  try:
    ranked = sorted(offering['topReport']['reasons'].items(), key=operator.itemgetter(1))
    primaryReason = ranked.pop()[0]
  except (KeyError, IndexError, AttributeError, TypeError):
    # Missing or malformed data only; a bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit.
    return {}
  return {'primaryReason': displayMap[primaryReason]}

In [7]:
# Add a 'primaryReason' label to every offering that has a top report.
# The second loop variable must not be called `topReport`: at notebook scope
# that would rebind the `topReport` function and break any later re-run of
# linkReportsToOfferings.
for offering, linkedReport in allWith('topReport', offerings):
  primaryReason = getPrimaryReason(offering)
  offering.update(primaryReason)