In [1]:
import os
import re
import collections
import json
from functools import reduce

In [2]:
BASE = os.path.expanduser('~/github')
ORG = 'Nino-cunei'
REPO = 'oldbabylonian'
VERSION = '0.2'

REPO_PATH = f'{BASE}/{ORG}/{REPO}'

TRANS_DIR = f'{REPO_PATH}/sources/cdli/transcriptions/{VERSION}'

SOURCES = (
  'AbB-primary',
  'AbB-secondary',
)

SRC_EXT = '.txt'

REPORT_DIR = f'{REPO_PATH}/reports'

if not os.path.exists(REPORT_DIR):
  os.makedirs(REPORT_DIR, exist_ok=True)

In [40]:
transRe = re.compile(r'''^[0-9a-zA-Z'.]+\s+(.*)$''')
scoreRe = re.compile(r'''_([^_]*)_''')
numeralRe = re.compile(r'''[0-9]+\(([^)]+)\)''')
stickyNumeralRe = re.compile(r'''([0-9]+\([^)]+\)){2,}''')
splitRe = re.compile(r'''[ \t{}<>\[\].]+''')
wrongBeforeRe = re.compile(r'''[/|]''')
wrongAfterRe = re.compile(r'''[0-9/|][^0-9]''')

def numeralRepl(match):
  return f' {match.group(1)} '

maxReadingLength = 0

def readSourceFile(src, readings, errors):
  with open(f'{TRANS_DIR}/{src}{SRC_EXT}') as fh:
    inTrans = False
    l = 0
    for line in fh:
      l += 1
      if line.startswith('Transcription:'):
        inTrans = True
        continue
      elif line[0].isupper():
        inTrans = False
        continue
      line = line.strip()
      match = transRe.match(line)
      if not match:
        continue
      trans = match.group(1)
      nScores = trans.count('_')
      if nScores % 2:
        errors['unbalanced underscores'][src].add((l, line, None))
        continue
      scores = scoreRe.findall(trans)
      for score in scores:
        if score == '':
          errors['empty score __'][src].add((l, line, None))
        else:
          for rd in getReadings(score, src, l, line, errors):
            kind = 'lower' if rd.islower() else 'upper' if rd.isupper() else 'mixed'
            if kind == 'mixed':
              errors['mixed case in alternative'][src].add((l, line, rd))
              continue
            readings['alternative'][kind].add(rd)
      primary = scoreRe.sub('', trans)
      for rd in getReadings(primary, src, l, line, errors):
        kind = 'lower' if rd.islower() else 'upper' if rd.isupper() else 'mixed'
        if kind == 'mixed':
          errors['mixed case in default'][src].add((l, line, rd))
          continue
        readings['default'][kind].add(rd)
      
          
def getReadings(material, src, l, line, errors):
  global maxReadingLength
  
  words = splitRe.split(material)
  readings = reduce(set.union, (set(word.split('-')) for word in words), set())
  
  thisMax = max(len(rd) if rd.isalnum() else 1 for rd in readings)
  if thisMax > maxReadingLength:
    maxReadingLength = thisMax
  return filterReadings(readings, src, l, line, errors)

def filterReadings(rds, src, l, line, errors):
  newRds = set()
  for rd in rds:
    if wrongBeforeRe.search(rd):
      errors['A malformed reading before weeding'][src].add((l, line, rd))
      continue
    rd = (
      rd.\
      replace('#', '').\
      replace('.', '').\
      replace('?', '').\
      replace('!', '').\
      replace('$', '').\
      replace('*', '').\
      replace('+', '')
    )
    if stickyNumeralRe.search(rd):
        errors['adjacent numerals'][src].add((l, line, rd))
    rd = numeralRe.sub(numeralRepl, rd)
    rd = rd.replace('(', ' ').replace(')', ' ')
    subrds = rd.strip().split()
    
    for srd in subrds:
      if wrongAfterRe.search(srd):
        rdRep = f'"{srd}" in "{rd}"' if len(rd) > 1 else "{srd}"
        errors['Z malformed reading after weeding'][src].add((l, line, rdRep))
        continue
      if len(srd) > 7:
        rdRep = f'"{srd}" in "{rd}"' if len(rd) > 1 else "{srd}"
        errors['long reading'][src].add((l, line, rdRep))
        continue
      if srd != '':
        newRds.add(srd)
  return newRds

def showErrors(errors, batch=10):
  if not errors:
    print('No errors')
  else:
    for (error, srcs) in sorted(errors.items()):
      print(f'ERROR {error}')
      for (src, data) in sorted(srcs.items()):
        print(f'\t{src} ({len(data)}x)')
        for (l, line, sore) in sorted(data)[0:batch]:
          soreRep = '' if sore is None else f'"{sore}" in '
          print(f'\t\t{l}: {soreRep}{line}')
        if len(data) > batch:
          print(f'\t\t + more')

def printErrors(errors):
  outFile = f'{REPORT_DIR}/errors.tsv'
  if os.path.exists(outFile):
    os.unlink(outFile)
  with open(outFile, 'w') as fh:
    fh.write('\t'.join(f'''
    error
    sourcefile
    lineno
    wrong
    line
'''.strip().split()))
    fh.write('\n')
    for (error, srcs) in sorted(errors.items()):
      for (src, data) in sorted(srcs.items()):
        for (l, line, sore) in sorted(data):
          soreRep = '' if sore is None else sore
          fh.write('\t'.join((error, src, str(l), soreRep, line)))
          fh.write('\n')

def showReadings(msg, readings, batch=20):
  print(f'''
================================================
= {msg} max reading length is {maxReadingLength}
================================================
''')
  totals = collections.Counter()
  for (cls, clsItems) in readings.items():
    for (kind, kindItems) in clsItems.items():
      totals[cls] += len(kindItems)
  for (cls, clsItems) in readings.items():
    print(f'{cls:<15}: {totals[cls]:>4} readings')
    for (kind, kindItems) in clsItems.items():
      print(f'\t{kind:<15}: {len(kindItems):>4} readings')
      if batch:
        for it in sorted(kindItems)[0:batch]:
          print(f'\t\t{it}')
        if len(kindItems) > batch:
          print(f'\t\t + more')
          
def printReadings(readings):
  xReadings = set()
  for (cls, clsItems) in readings.items():
    for (kind, kindItems) in clsItems.items():
      for it in kindItems:
        xReadings.add((kind, it))
        
  outFile = f'{REPORT_DIR}/readings.tsv'
  if os.path.exists(outFile):
    os.unlink(outFile)
  with open(outFile, 'w') as fh:
    fh.write('\t'.join(f'''
    kind
    reading
'''.strip().split()))
    fh.write('\n')
    for (kind, rd) in sorted(xReadings):
      fh.write('\t'.join((kind, rd)))
      fh.write('\n')

In [41]:
readings = collections.defaultdict(lambda: collections.defaultdict(set))
errors = collections.defaultdict(lambda: collections.defaultdict(set))
maxReadingLength = 0

for src in SOURCES:
  readSourceFile(src, readings, errors)

In [42]:
printErrors(errors)
printReadings(readings)

In [43]:
showErrors(errors)
showReadings('Readings', readings, batch=100)

ERROR A malformed reading before weeding
	AbB-primary (49x)
		7549: "ta/sza" in 12. la ki a ta/sza li# ki x
		7618: "5/6(disz)" in 11. _1(disz) 5/6(disz) gin2 ku3-babbar_ sza _igi-6(disz)-gal2 ku3-babbar zu2-lum dilmun-na_
		8531: "KU/MA" in 5. x x ni-su2-uq KU/MA x [...]
		8936: "1/3(disz)" in 8. _1/3(disz) ma-na ku3-babbar_
		9198: "1/3(disz)" in 9. _1/3(disz) ma-na 1(disz) gin2 ku3-babbar na4_ {d}utu
		9499: "2/3(disz)" in 4. _3(u) 2/3(disz) ma-na 8(disz) gin2 an-na-kam_
		9555: "1/3(disz)" in 1. _1/3(disz) ma-na 5(disz) gin2 ku3-babbar_
		9882: "ir/ni" in 25. u3 _a-sza3_ DISZ sza ta mi ir/ni ia a-di [u2-ul]
		10085: "1/2(disz)" in 6. _1/2(disz) gin2 1(u) sze ku3-babbar na4_ {d}utu usz-ta-bi-la#-[kum]
		10579: "1/2(disz)" in 3. _2(disz) 1/2(disz) ma-na 1(disz) 1/2(disz) gin2 ku3-babbar_
		 + more
	AbB-secondary (72x)
		2204: "1/2(disz)" in 7. _3(disz) 1/2(disz) ma-na ku3-babbar_
		2206: "1/2(disz)" in 9. _3(disz) 1/2(disz) ma-na ku3-babbar_
		2905: "2/3(disz)" in 5. _2/3(disz) ma-na

In [39]:
numeralRe = re.compile(r'''([0-9]+\([^)]+\))''')
stickyNumeralRe = re.compile(r'''((?:[0-9]+\([^)]+\)){2,})''')

def stickyNumeralRepl(match):
  print(match.groups()[0])
  print(numeralRe.findall(match.groups()[0]))
  print(numeralRe.findall(match.groups()[0]))
  return ' '.join(numeralRe.findall(match.groups()[0]))

x = 'gur-1(a)2(b)3(c)-ki'
stickyNumeralRe.sub(stickyNumeralRepl, x)

1(a)2(b)3(c)
['1(a)', '2(b)', '3(c)']
['1(a)', '2(b)', '3(c)']


'gur-1(a) 2(b) 3(c)-ki'