# In [None]:
def zeroBy(zeroAt, x):
  """Evaluate a straight-line score at x, parameterised by zeroAt.

  The line has slope 1 / (1 - zeroAt) and equals slope * (x - zeroAt) + 1,
  so it yields 2 at x == 1, 1 at x == zeroAt and 0 at x == 2 * zeroAt - 1.

  NOTE(review): despite the name, the value at x == zeroAt is 1, not 0 --
  confirm whether the "+ 1" offset is intended before relying on the name.

  Raises ZeroDivisionError when zeroAt == 1.
  """
  # Float literals force true division even when zeroAt is an int; under
  # Python 2 the original `1 / (1 - zeroAt)` would truncate (e.g. 1 / -2 == -1).
  slope = 1.0 / (1.0 - zeroAt)
  return slope*x - (slope*zeroAt - 1)

def sizeRelevance(offering, toReport, ofReport, widen=0):
  """Score how close ofReport's size is to toReport's size.

  Both 'size' values are converted with float(). The ratio size / center is
  passed to zeroBy with a parameter of 1.5 * (1 + widen) when ofReport is
  larger than toReport, and 0.67 * (1 - widen) otherwise. `offering` is not
  used.

  Raises ZeroDivisionError when toReport's size is 0. If the two sizes
  cannot be ordered (e.g. one is NaN) neither branch runs and the function
  implicitly returns None.
  """
  target = float(toReport['size'])
  candidate = float(ofReport['size'])
  # Larger-than-target case first; the two tests are mutually exclusive.
  if candidate > target:
    return zeroBy(1.5 * (1 + widen), candidate / target)
  if candidate <= target:
    return zeroBy(0.67 * (1 - widen), candidate / target)

def workloadRelevance(offering, toReport, toScore, ofReport):
  """Score how similar ofReport's workload score is to toScore.

  Returns 0.0 when ofReport has no usable
  ['responses']['workload']['score'], when either score is missing or zero,
  or when the two reports belong to different groups. Otherwise returns
  zeroBy(1.5, 1 + |1 - toScore / ofScore|). `offering` is not used.
  """
  try:
    ofScore = float(ofReport['responses']['workload']['score'])
  except (LookupError, TypeError, ValueError):
    # Missing key, None in the chain, or a non-numeric score: treat as
    # "no workload data". A bare except would also hide KeyboardInterrupt.
    return 0.0
  # Falsy scores (None or 0) are skipped; this also protects the division.
  if not toScore or not ofScore:
    return 0.0
  if toReport['group'] != ofReport['group']:
    return 0.0
  return zeroBy(1.5, (1 + abs(1 - toScore / ofScore)))
  
def titleRelevance(offering, toReport, ofReport):
  """Score ofReport against the offering by course identity and title.

  Returns -100. when ofReport has the same group and number as the
  offering, 1. when both titles contain the substring 'Research', and 0.0
  otherwise. `toReport` is not used.
  """
  isSameCourse = (offering['group'] == ofReport['group']
                  and offering['number'] == ofReport['number'])
  if isSameCourse:
    return -100.
  bothResearch = ('Research' in offering['title']
                  and 'Research' in ofReport['title'])
  return 1. if bothResearch else 0.0
  
def profsRelevance(offering, toReport, ofReport):
  """Return 0.5 when both reports list the same number of profs, else 0.0.

  A report without a 'profs' key counts as -1 profs, so two reports that
  both lack the key compare equal. `offering` is not used.
  """
  counts = [len(rep['profs']) if 'profs' in rep else -1
            for rep in (toReport, ofReport)]
  if counts[0] == counts[1]:
    return 0.5
  return 0.0

import re
# Matches a run of decimal digits; raw string so '\d' is not treated as a
# string escape.
numMatcher = re.compile(r'\d+')
def numberRelevance(offering, toNumberInt, ofReport):
  """Return 0.75 when ofReport's course number is in the same band as toNumberInt.

  The first run of digits in ofReport['number'] is taken as its number.
  Two numbers above 1000 match when they round to the same thousand;
  otherwise they match when they round to the same hundred. Returns 0.0
  when there is no match or when ofReport['number'] contains no digits.
  `offering` is not used.
  """
  matched = numMatcher.search(ofReport['number'])
  if matched is None:
    # No number to compare. The previous -1 sentinel rounded to bucket 0 and
    # so spuriously matched low-numbered offerings.
    return 0.0
  ofNumberInt = int(matched.group())
  # Float division on BOTH sides: under Python 2 `toNumberInt / 100` floored
  # while the other side rounded, so e.g. 160 did not match itself.
  if ofNumberInt > 1000 and toNumberInt > 1000 and round(ofNumberInt / 1000.) == round(toNumberInt / 1000.):
    return 0.75
  if round(ofNumberInt / 100.) == round(toNumberInt / 100.):
    return 0.75
  return 0.0

def groupRelevance(offering, ofReport):
  """Return 2.0 when ofReport is in the offering's group, otherwise 0.0."""
  if offering['group'] == ofReport['group']:
    return 2.0
  return 0.0
  
def calcRelevance(offering, toReport, reps, similarity):
  """Add each candidate report's relevance to `similarity`, in place.

  For every report in `reps` the size, title, profs, number, group and
  workload relevances (relative to `offering` / `toReport`) are summed and
  added to similarity[report['reportId']]. `similarity` must therefore
  either already hold every report id or supply a default (for example a
  defaultdict). Returns None.
  """
  toNumberInt = offering['numberInt']
  try:
    toScore = float(toReport['responses']['workload']['score'])
  except (LookupError, TypeError, ValueError):
    # No usable workload score on the reference report; workloadRelevance
    # treats None as "no data". A bare except would also hide
    # KeyboardInterrupt.
    toScore = None
  for ofReport in reps:
    relevance = sizeRelevance(offering, toReport, ofReport)
    relevance += titleRelevance(offering, toReport, ofReport)
    relevance += profsRelevance(offering, toReport, ofReport)
    relevance += numberRelevance(offering, toNumberInt, ofReport)
    relevance += groupRelevance(offering, ofReport)
    relevance += workloadRelevance(offering, toReport, toScore, ofReport)
    similarity[ofReport['reportId']] += relevance

def findSimilarReports(offering, report, limit=50):
  """Return up to `limit` reports most similar to `report`, best first.

  Candidates are every report listed in the module-level
  `reportsByDepartment` under the offering's primary department or any of
  offering['departments']. Each appearance under one of
  offering['departments'] adds 1 to a candidate's score; calcRelevance then
  adds the per-report relevance.

  `limit` is an int; the default of 50 keeps the previous fixed cut-off.
  Raises KeyError if a department is absent from `reportsByDepartment`
  (unless that mapping supplies defaults).
  """
  from collections import defaultdict
  from heapq import nlargest
  from operator import itemgetter
  similarity = defaultdict(float)
  similarReports = {}
  for rep in reportsByDepartment[offering['primaryDepartment']]:
    similarReports[rep['reportId']] = rep
  for otherDept in offering['departments']:
    for rep in reportsByDepartment[otherDept]:
      similarity[rep['reportId']] += 1
      similarReports[rep['reportId']] = rep
  calcRelevance(offering, report, similarReports.values(), similarity)
  # nlargest(n, it, key=k) is documented as equivalent to
  # sorted(it, key=k, reverse=True)[:n] but avoids sorting every candidate.
  ranked = nlargest(limit, similarity.items(), key=itemgetter(1))
  return [similarReports[reportId] for reportId, _ in ranked]
        
def _printSimilarDebug(offering, report, similar):
  """Print a per-component relevance breakdown of `similar` for one report."""
  def emit(*items):
    # One pre-joined string per call, so the call is valid both as a
    # Python 2 print statement and as a Python 3 print() call and produces
    # the same space-separated output as `print a, b, c`.
    print(' '.join('%s' % (item,) for item in items))

  toNumberInt = offering['numberInt']
  try:
    toScore = float(report['responses']['workload']['score'])
  except (LookupError, TypeError, ValueError):
    toScore = None
  emit(report['group'], report['number'], report['title'], '(' + str(report['year']) + ')', 'Size:', report['size'])
  emit('Departments:', offering['departments'])
  emit('-----')
  for rep in similar[0:50]:
    size = sizeRelevance(offering, report, rep)
    title = titleRelevance(offering, report, rep)
    profs = profsRelevance(offering, report, rep)
    number = numberRelevance(offering, toNumberInt, rep)
    group = groupRelevance(offering, rep)
    workload = workloadRelevance(offering, report, toScore, rep)
    # Sum of the components computed here; it does not include the
    # per-department increment that findSimilarReports adds when ranking.
    score = size + title + profs + number + group + workload
    emit('Relevance:', '{0:0.2f}'.format(score), rep['group'], rep['number'], rep['title'], '(' + str(rep['year']) + ')')
    emit('                              Size:', '{0:0.2f}'.format(size), report['size'], rep['size'],
         '  Profs:', profs,
         '  Title:', title,
         '  Number:', number, report['number'], rep['number'],
         '  Group:', group)
    emit('  Workload:', workload)

def genSimilarReportsByReportId(offerings, debug=False):
  """Map each report id to the reports findSimilarReports ranks as similar.

  Iterates the (offering, topReport) pairs produced by
  allWith('topReport', offerings); for each offering the top report and any
  offering['reports'] are processed. Progress is printed every 500
  offerings.

  With debug=True nothing is stored: for the first report of each offering
  a relevance breakdown is printed, followed by a one-second pause, and the
  returned dict is empty.
  """
  from time import sleep
  similarRepsById = {}
  doneWith = 0
  for offering, topReport in allWith('topReport', offerings):
    reports = [topReport]
    if 'reports' in offering:
      reports = reports + offering['reports']
    for report in reports:
      similar = findSimilarReports(offering, report)
      if debug:
        # findSimilarReports returns plain reports (no scores), so the
        # breakdown is recomputed by the helper.
        _printSimilarDebug(offering, report, similar)
        sleep(1)
        break
      similarRepsById[report['reportId']] = similar
    doneWith += 1
    if doneWith % 500 == 0:
      print('Done with %s' % (doneWith,))
  return similarRepsById