# my.harvard Mappers
Below are all of the mappings from the raw my.harvard scrape to the parsed offerings objects in the database. Each one is broken into its own cell to make testing easier. For testing, `genSamples`, `genSample`, and `Mapper`'s `uniques` method come in handy.

In [1]:
from jupyterImport import enableJupyterImports
enableJupyterImports()
from Utilities import find, Mapper, MultiReplace, genSamples, genSample, name, analyze
from load_raw_data import myHarvard
import sys, json

In [2]:
# Abbreviation -> full-text substitutions applied to every department name by
# mapDepartments (via deptReplace.sub).
# NOTE(review): several keys use '^' / '$' anchors, so they are presumably
# treated as regular expressions by MultiReplace -- confirm in Utilities.
deptReplace = MultiReplace({
  '&': 'and', 
  'Biol$': 'Biology', 
  ' Sci ': ' Science ', 
  ' Chem ': ' Chemistry ',
  ' Amer ': ' American ',
  '^Biolog ': 'Biological ',
  ' Bio ': ' Biology ',
  'Med$': 'Medicine',
  'Pharm$': 'Pharmacology',
  'GenEd: ': '',
  'Hlth$': 'Health',
  'Aesthetic and Interpretive': 'Aesthetic and Interpretive Understanding',
  'Pol$': 'Policy',
  ' Regen ': ' Regenerative ',
  'Science of Physical Universe': 'Science of the Physical Universe',
  '^E Asian': 'East Asian',
  'Math Reasoning': 'Mathematical Reasoning',
  'E Europe and Cntrl Asia': 'Eastern Europe, and Central Asia',
  'E Europe, Central Asia': 'Eastern Europe, and Central Asia',
  'Biomedical Sci': 'Biomedical Sciences',
  '^Adv Stud': 'Advanced Studies',
  'and Civ': 'and Civilizations',
  ' Langs ': ' Languages ',
  'Lit$': 'Literature',
  'Arch, Landscape and Urban Plan': 'Architecture, Landscape, and Urban Planning',
  'Envi Science': 'Environmental Science'
})

In [3]:
def mapDepartments(inputs):
  """Build the 'departments' list for an offering.

  inputs is a pair (depts, primary); depts may be a single value or a list.
  The result lists the primary department first, followed by the others,
  each passed through deptReplace.sub.
  """
  others, primary = inputs
  if not isinstance(others, list):
    others = [others]
  expanded = [deptReplace.sub(dept) for dept in [primary] + others]
  return {'departments': expanded}
departmentsMapper = Mapper(['IS_SCL_DESCR_IS_SCL_DESCRM', 'IS_SCL_DESCR_IS_SCL_DESCRD'], mapDepartments)

In [4]:
# Substitutions that strip boilerplate from the recommended-prep text.
# The keys containing backslashes are written as raw strings so the escapes
# are not interpreted (or warned about) as string escape sequences; the
# resulting string values are unchanged.
# NOTE(review): keys are presumably regular expressions -- confirm in
# Utilities.MultiReplace.
prepReplace = MultiReplace({
  'Prerequisite: ': '',
  r'^\*': '',
  r'\.$': ''
})
def mapRecommendedPrep(prep):
  """Return {'recommendedPrep': cleaned text}, or {} when prep is empty/None."""
  if not prep:
    return {}
  return {'recommendedPrep': prepReplace.sub(prep)}
recommendedPrepMapper = Mapper('RecPrepDescr', mapRecommendedPrep)

In [5]:
# Mapper for the raw 'IS_SCL_DESCRSHORT_HU_CONSENT' field, built with
# Mapper.noChange('consentRequired'); presumably copies the value unchanged
# under that key -- confirm in Utilities.Mapper.
consentRequiredMapper = Mapper('IS_SCL_DESCRSHORT_HU_CONSENT', Mapper.noChange('consentRequired'))

In [6]:
# Abbreviated component names from the scrape -> display names.
formatMap = {
  'Read Rsch': 'Reading and Research',
  'Doc Diss': 'Doctoral Dissertation',
  'ReadingCrs': 'Reading Course',
  'Fresh Sem': 'Freshman Seminar',
  'JrTutorial': 'Junior Tutorial',
  'SrTutorial': 'Senior Tutorial',
  'Rsrch Wksh': 'Research Workshop',
  'Rsrch Sem': 'Research Seminar',
  'Soph Tutor': 'Sophomore Tutorial',
  'Lab Rsrch': 'Lab Research',
  'Sem Wrkshp': 'Seminar Workshop',
  'House Sem': 'House Seminar',
  'Ind Study': 'Independent Study',
  'Rsch': 'Research',
  'LW': 'Lecture/Workshop'
}
def mapFormat(frmt):
  """Return {'format': name}, expanding known abbreviations via formatMap.

  Values not present in formatMap are passed through as-is; an empty or
  None value yields {}.
  """
  if not frmt:
    return {}
  return {'format': formatMap.get(frmt, frmt)}
# Mapper pairing the raw 'SSR_COMPONENT_NAME' field with mapFormat.
formatMapper = Mapper('SSR_COMPONENT_NAME', mapFormat)

In [7]:
# Mapper for the raw 'HU_COURSE_PREQ' field, built with
# Mapper.noChange('notes'); presumably copies the value unchanged under
# 'notes' -- confirm in Utilities.Mapper.
notesMapper = Mapper('HU_COURSE_PREQ', Mapper.noChange('notes'))

In [8]:
# Mapper for the raw 'IS_SCL_DESCR100_HU_SCL_GRADE_BASIS' field, built with
# Mapper.noChange('gradingBasis') (pass-through, presumably -- confirm).
gradingBasisMapper = Mapper('IS_SCL_DESCR100_HU_SCL_GRADE_BASIS', Mapper.noChange('gradingBasis'))

In [9]:
# Mapper for the raw 'UnitsDisplay' field, built with Mapper.noChange('units')
# (pass-through, presumably -- confirm).
unitsMapper = Mapper('UnitsDisplay', Mapper.noChange('units'))

In [10]:
# Mapper for the raw 'CLASS_NBR' field, built with
# Mapper.noChange('classNumber') (pass-through, presumably -- confirm).
classNumberMapper = Mapper('CLASS_NBR', Mapper.noChange('classNumber'))

In [11]:
# Mapper for the raw 'CRSE_ID' field, built with Mapper.noChange('courseId')
# (pass-through, presumably -- confirm).
courseIdMapper = Mapper('CRSE_ID', Mapper.noChange('courseId'))

In [12]:
def mapEnrollmentCap(cap):
  """Return {'enrollmentCap': int(cap)}, or {} when there is no usable cap.

  An empty/None cap and the literal strings '999' and '9999' all yield {}
  (the latter two are presumably "unlimited" placeholders in the scrape).
  """
  if cap and cap not in ('999', '9999'):
    return {'enrollmentCap': int(cap)}
  return {}
# Mapper pairing the raw 'ENRL_CAP' field with mapEnrollmentCap.
enrollmentCapMapper = Mapper('ENRL_CAP', mapEnrollmentCap)

In [13]:
# Mapper for the raw 'Subject' field, built with Mapper.noChange('group')
# (pass-through, presumably -- confirm).
groupMapper = Mapper('Subject', Mapper.noChange('group'))

In [14]:
# Strips the generic Expository Writing prefix from combined titles.
titleReplace = MultiReplace({'Expository Writing 20: ': '' })
def mapTitle(inputs):
  """Build the 'title' field from a (title, topic) pair.

  When the second element is non-empty it is appended as "title: topic";
  the combined string is then passed through titleReplace.sub.
  """
  title, expos = inputs
  if expos:
    fullTitle = title + ': ' + expos
  else:
    fullTitle = title
  return {'title': titleReplace.sub(fullTitle)}
titleMapper = Mapper(['Title', 'DESCRFORMAL_COURSE_TOPIC'], mapTitle)

In [15]:
def mapGenEds(genEds):
  """Return {'genEds': [...]}, wrapping a single value in a list.

  An empty or None input yields {}.
  """
  if not genEds:
    return {}
  if not isinstance(genEds, list):
    genEds = [genEds]
  return {'genEds': genEds}
# Mapper pairing the raw 'IS_SCL_DESCR100_HU_SCL_ATTR_GE' field with mapGenEds.
genEdsMapper = Mapper('IS_SCL_DESCR100_HU_SCL_ATTR_GE', mapGenEds)

In [16]:
# Long-form level descriptions from the scrape -> short display labels.
levelMap = {
  'Graduate Course': 'Grad',
  'Primarily for Undergraduate Students': 'Mainly Undergrad',
  'For Undergraduate and Graduate Students': 'Grad or Undergrad',
  'Primarily for Graduate Students': 'Mainly Grad'
}
def mapLevel(level):
  """Return {'level': label}, abbreviating known levels via levelMap.

  Levels that are not keys of levelMap are passed through unchanged; an
  empty or None level yields {}.
  """
  if not level:
    return {}
  # Look the value up in levelMap (the previous `level in level` test was
  # always true for strings, so unknown levels raised KeyError).
  return {'level': levelMap.get(level, level)}
# Mapper for the raw 'IS_SCL_DESCR100_HU_SCL_ATTR_LEVL' field.
# NOTE(review): mapLevel and levelMap are defined just above but are not used
# here; this mapper is built with Mapper.noChange('level') instead. Confirm
# whether the abbreviated level labels were meant to be applied.
levelMapper = Mapper('IS_SCL_DESCR100_HU_SCL_ATTR_LEVL', Mapper.noChange('level'))

In [17]:
def mapCourseSite(site):
  """Return {'courseSite': site}, or {} when site is the 'NOURL' placeholder."""
  if site == 'NOURL':
    return {}
  return {'courseSite': site}
# Mapper pairing the raw 'URL_URLNAME' field with mapCourseSite.
courseSiteMapper = Mapper('URL_URLNAME', mapCourseSite)

In [18]:
def mapCrossReg(crossReg):
  """Return {'crossReg': [...]}, wrapping a single value in a list.

  An empty or None input yields {}.
  """
  if not crossReg:
    return {}
  asList = crossReg if isinstance(crossReg, list) else [crossReg]
  return {'crossReg': asList}
# Mapper pairing the raw 'IS_SCL_DESCR100_HU_SCL_ATTR_XREG' field with mapCrossReg.
crossRegMapper = Mapper('IS_SCL_DESCR100_HU_SCL_ATTR_XREG', mapCrossReg)

In [19]:
def shorten(year):
  """Abbreviate a four-digit 20xx year string to an apostrophe form.

  '2017' -> "'17", '2020' -> "'20". Strings that are not four characters
  starting with '20' are returned unchanged.
  """
  # Only strip the leading century; replacing every "20" in the string
  # would also eat a trailing "20" (e.g. '2020', '1920').
  if len(year) == 4 and year.startswith('20'):
    return "'" + year[2:]
  return year
def mapTermYear(inputs):
  """Derive term/year fields from a (normal, likely) pair of strings.

  - likely set: it is split on a space into (year, term) and reported as
    'likelyTerm', 'likelyYear' and a 'termYear' prefixed with "Likely".
  - otherwise, normal == 'Not Offered': only 'termYear' is returned.
  - otherwise normal is split on a space into (year, term) and reported as
    'term', 'year' and 'termYear'.
  """
  normal, likely = inputs
  if likely:
    year, term = tuple(likely.split(' '))
    return {
      'likelyTerm': term,
      'likelyYear': year,
      'termYear': 'Likely ' + term + ' ' + shorten(year)
    }
  if normal == 'Not Offered':
    return {'termYear': 'Not Offered'}
  year, term = tuple(normal.split(' '))
  return {'term': term, 'year': year, 'termYear': term + ' ' + shorten(year)}
# Mapper pairing the two raw term fields with mapTermYear, which unpacks them
# as (normal, likely) -- presumably in the order the keys are listed here.
termYearMapper = Mapper(['IS_SCL_DESCR_IS_SCL_DESCRH', 'IS_SCL_MEETING_PAT_HU_LIKELY_OFFERED'], mapTermYear)

In [20]:
descReplace = MultiReplace({'<p>': '', '</p>': ''})
def mapDescription(desc):
  return {'description': descReplace.sub(desc)} if desc else {}
descriptionMapper = Mapper('IS_SCL_DESCR', mapDescription)

In [21]:
import re
def mapNumber(inputs):
  """Return the catalog number plus its leading integer part.

  inputs is a (number, group) pair; group is accepted but not used.
  'numberInt' is the first run of digits found in number, or -1 when the
  number contains no digits.
  """
  number, _group = inputs
  # Raw string keeps \d as a regex escape rather than a string escape.
  firstDigits = re.search(r'\d+', number)
  return {
    'number': number,
    'numberInt': int(firstDigits.group()) if firstDigits else -1
  }
# Mapper pairing the raw 'CATALOG_NBR' and 'Subject' fields with mapNumber.
numberMapper = Mapper(['CATALOG_NBR', 'Subject'], mapNumber)

In [22]:
def mapProfs(profs):
  """Return {'profs': [...]} with a display and a match name per professor.

  Each professor record must have a 'Name' entry. An empty or None input
  yields {}.
  """
  if not profs:
    return {}
  # Wrap a lone record in a list (same normalization as mapGenEds and
  # mapCrossReg) instead of printing it and then iterating over it.
  # NOTE(review): assumes a non-list value is a single professor record.
  if not isinstance(profs, list):
    profs = [profs]
  def mapProf(prof):
    fullName = prof['Name']
    return {'displayName': name(fullName), 'matchName': name(fullName, match=True)}
  return {'profs': [mapProf(p) for p in profs]}
profsMapper = Mapper('DESCRLONG_DETAILS', mapProfs)

In [23]:
def mapObjectID(inputs):
  """Build the 'objectID' as "<courseId>_<classNumber>".

  inputs is a (classNumber, courseId) pair; a missing/empty class number
  is replaced by '0'.
  """
  classNumber, courseId = inputs
  suffix = classNumber or '0'
  return {'objectID': '_'.join((courseId, suffix))}
# Mapper pairing the raw 'CLASS_NBR' and 'CRSE_ID' fields with mapObjectID.
objectIDMapper = Mapper(['CLASS_NBR', 'CRSE_ID'], mapObjectID)

In [24]:
def mapMinutes(minutes):
  """Convert a minute count into a fraction of an hour.

  59 and 29 are rounded up to a full hour (1.0) and a half hour (0.5);
  every other value is minutes / 60.
  """
  if minutes == 59:
    return 1.0
  if minutes == 29:
    return 0.5
  return minutes / 60.0

def mapTime(time):
  """Convert a clock string such as '1:30pm' into fractional hours.

  Returns None when time is None. Hours are on a 24-hour scale: 'pm' adds
  12 to hours below 12, and 12:xx 'am' maps to hour 0.

  Raises ValueError when the string is not of the form H:MM (optionally
  with an am/pm marker) or its parts are not integers.
  """
  if time is None:
    return None
  isPm = 'pm' in time
  isAm = 'am' in time
  parts = time.replace('am', '').replace('pm', '').split(':')
  if len(parts) != 2:
    # A bare `raise` here has no active exception to re-raise; report the
    # bad value explicitly instead.
    raise ValueError('Unrecognized time format: %r' % (time,))
  hours, minutes = int(parts[0]), int(parts[1])
  if isPm and hours < 12:
    hours += 12
  elif isAm and not isPm and hours == 12:
    # 12:xx am is just after midnight, not noon.
    hours = 0
  return float(hours) + mapMinutes(minutes)
    
# Two-letter day codes that collapse to a single letter; other codes are
# not listed here.
daysReplace = MultiReplace({'Mo': 'M', 'We': 'W', 'Fr': 'F'})
def mapDays(days):
  """Apply daysReplace to a space-separated day string and split it into a list."""
  shortened = daysReplace.sub(days)
  return shortened.split(' ')

def assignIf(dictionary):
  """Return a setter that writes key/value into dictionary only for truthy values."""
  def setIfTruthy(key, value):
    if not value:
      return
    dictionary[key] = value
  return setIfTruthy

# Cleanup applied to location names: drops some school suffixes and expands
# two abbreviations. Keys with backslashes are raw strings so the escaped
# parentheses are not treated as (invalid) string escape sequences; the
# string values are unchanged.
# NOTE(review): keys are presumably regular expressions -- confirm in
# Utilities.MultiReplace.
locReplace = MultiReplace({
  r' \(FAS\)': '',
  r' \(SEAS\)': '',
  r' \(HLS\)': '',
  'Bldg': 'Building', 
  'Ctr': 'Center',
  r' \(HDS\)': '',
  r' \(GSD\)': ''
})
def mapLocName(name):
  """Return the cleaned location name, or None when name is empty/None.

  The parameter shadows the imported `name` helper inside this function
  only; it is kept for interface compatibility.
  """
  if not name:
    return None
  return locReplace.sub(name)

def mapLatLong(latLong):
  """Return the coordinate unchanged, or None when it is the string '0'."""
  if latLong == '0':
    return None
  return latLong

def mapSession(inputs):
  """Build one session dict from a 7-tuple of raw meeting fields.

  inputs is (start, end, days, locName, locNumber, latitude, longitude).
  The result may contain:
    'time':     {'start': ..., 'end': ...} when both mapped times are truthy
    'days':     list from mapDays, when truthy
    'location': dict with any of 'name', 'number', 'longitude', 'latitude'
                that are truthy; omitted when none are.
  """
  start, end, days, locName, locNumber, latitude, longitude = inputs
  session = {}
  setSessionField = assignIf(session)

  startTime = mapTime(start)
  endTime = mapTime(end)
  if startTime and endTime:
    session['time'] = {'start': startTime, 'end': endTime}
  setSessionField('days', mapDays(days))

  location = {}
  setLocationField = assignIf(location)
  setLocationField('name', mapLocName(locName))
  setLocationField('number', locNumber)
  setLocationField('longitude', mapLatLong(longitude))
  setLocationField('latitude', mapLatLong(latitude))
  if location:
    session['location'] = location
  return session

# Raw field names for one meeting session. The order lines up with the tuple
# mapSession unpacks: (start, end, days, locName, locNumber, latitude,
# longitude) -- presumably Mapper passes values in this order; confirm.
sessionKeys = [
  'IS_SCL_TIME_START', 
  'IS_SCL_TIME_END', 
  'IS_SCL_MEETING_PAT', 
  'IS_SCL_DESCR_IS_SCL_DESCRG',
  'BLDG_CD',
  'HU_LATITUDE',
  'HU_LONGITUDE'
]
def _mapSessions(inputs):
  """Return a list of session dicts for an offering, or None.

  inputs is (multiSection, start, end, days, locNameOrNames, locNumber,
  latitude, longitude).
  - days == 'TBA': None.
  - multiSection truthy: one mapped session per entry, paired (via zip)
    with a location name; a single name is repeated for every entry and a
    missing name becomes None. Each entry's 'IS_SCL_DESCR_IS_SCL_DESCRG'
    key is overwritten in place with its paired name before mapping.
  - otherwise: a single session built from the top-level fields, or None
    when days is None.
  """
  multiSection, start, end, days, locNameOrNames, locNumber, latitude, longitude = inputs
  if days == 'TBA':
    return None

  if not multiSection:
    if days is None:
      return None
    return [mapSession((start, end, days, locNameOrNames, locNumber, latitude, longitude))]

  # Normalize the location name(s) to one entry per section.
  if not locNameOrNames:
    locNames = [None] * len(multiSection)
  elif isinstance(locNameOrNames, list):
    locNames = locNameOrNames
  else:
    locNames = [locNameOrNames] * len(multiSection)

  sessions = []
  for section, locName in zip(multiSection, locNames):
    section['IS_SCL_DESCR_IS_SCL_DESCRG'] = locName
    mapped = {}
    Mapper(sessionKeys, mapSession).map(section, mapped)
    sessions.append(mapped)
  return sessions
def mapSessions(inputs):
  """Return {'sessions': [...], 'sessionString': 'M, W, ...'} for an offering.

  Returns {} when _mapSessions yields nothing. 'sessionString' joins the
  first session's 'days' with ", " and is omitted when that session has no
  usable 'days' list.
  """
  sessions = _mapSessions(inputs)
  if not sessions:
    return {}
  sessionsDict = {'sessions': sessions}
  try:
    sessionsDict['sessionString'] = ', '.join(sessions[0]['days'])
  except (KeyError, IndexError, TypeError):
    # Best effort: the first session may lack 'days' or hold non-string
    # entries. Only those lookup/join failures are ignored, so unrelated
    # errors (and KeyboardInterrupt) still propagate.
    pass
  return sessionsDict

sessionsMapper = Mapper(['MultiSection'] + sessionKeys, mapSessions)

In [25]:
# Mapper for the raw 'IS_SCL_DESCR_HU_SCL_EXAM_GROUP' field, built with
# Mapper.noChange('examDateTime') (pass-through, presumably -- confirm).
examDateTimeMapper = Mapper('IS_SCL_DESCR_HU_SCL_EXAM_GROUP', Mapper.noChange('examDateTime'))

In [26]:
# Mapper for the raw 'EnrlDisplay' field, built with
# Mapper.noChange('enrollmentStatus') (pass-through, presumably -- confirm).
enrollmentStatusMapper = Mapper('EnrlDisplay', Mapper.noChange('enrollmentStatus'))

In [27]:
# Every mapper defined above, in the order they are applied to each raw
# offering by the parsing loop below.
mappers = [
  departmentsMapper,
  recommendedPrepMapper,
  consentRequiredMapper,
  formatMapper,
  notesMapper,
  gradingBasisMapper,
  unitsMapper,
  classNumberMapper,
  courseIdMapper,
  enrollmentCapMapper,
  groupMapper,
  titleMapper,
  genEdsMapper,
  levelMapper,
  courseSiteMapper,
  crossRegMapper,
  termYearMapper,
  descriptionMapper,
  numberMapper,
  profsMapper,
  objectIDMapper,
  sessionsMapper,
  examDateTimeMapper,
  enrollmentStatusMapper
]

In [28]:
# Load the raw scrape through myHarvard() (imported from load_raw_data).
rawOfferings = myHarvard()

In [29]:
# Run every mapper over every raw offering, letting each mapper write its
# fields into a fresh `parsed` dict, and collect the results.
# `raw`, `parsed` and `mapper` stay bound at notebook scope after the loop.
offerings = []
for raw in rawOfferings:
  parsed = {}
  for mapper in mappers:
    mapper.map(raw, to=parsed)
  offerings.append(parsed)

In [30]:
# len(offerings)

13654

In [32]:
# sessions = []
# for offering in offerings:
#   if 'sessions' in offering:
#     sessions += offering['sessions']
# analyze('location', sessions)

Is optional? True 2307 of 4408
Counter({<type 'dict'>: 2307})
[('{"latitude": "42.375885", "name": "Gund 111", "longitude": "-71.113862", "number": "02240"}', 34), ('{"latitude": "42.335303", "name": "Kresge 200 (HSPH)", "longitude": "-71.102839", "number": "02790"}', 29), ('{"latitude": "42.335575", "name": "FXB G11 (HSPH)", "longitude": "-71.10173", "number": "02720"}', 28), ('{"latitude": "42.37141", "name": "Belfer 200 Starr Aud (HKS)", "longitude": "-71.121541", "number": "02112"}', 26), ('{"latitude": "42.371036", "name": "Littauer Building 140 (HKS)", "longitude": "-71.122057", "number": "02111"}', 25), ('{"latitude": "42.37141", "name": "Belfer 400 Land Hall (HKS)", "longitude": "-71.121541", "number": "02112"}', 24), ('{"latitude": "42.371036", "name": "Littauer Building L230 (HKS)", "longitude": "-71.122057", "number": "02111"}', 24), ('{"latitude": "42.371036", "name": "Littauer Building 280 (HKS)", "longitude": "-71.122057", "number": "02111"}', 23), ('{"latitude": "42.3353