# Mapping Old Babylonian readings to Unicode

## Task

We want to map the *readings* in the Old Babylonian Corpus to Unicode strings in the Cuneiform block,
based on extant mapping tables.

## Problem

There are multiple mapping tables, and there are several ways to transliterate readings, so a reading does not always correspond to exactly one table entry.

## Sources

We take the ATF transliterations from CDLI, for tablets found by a search on AbB and Old Babylonian.

We take the file
[GeneratedSignList.json](https://github.com/Nino-cunei/oldbabylonian/blob/master/sources/writing/GeneratedSignList.json)
with mappings like

```json
        "BANIA": {
            "signName": "BANIA",
            "signNumber": 551,
            "signCunei": "𒑔",
            "codePoint": "",
            "values":
			[
                "BANIA", "AŠ2.UoverU", "5SŪTU"
            ]
        },
        "MA": {
            "signName": "MA",
            "signNumber": 552,
            "signCunei": "𒈠",
            "codePoint": "",
            "values":
			[
                "MA", "PEŠ3", "PEŠŠE", "WA6"
            ]
        },
```

Thanks to
[Auday Hussein](https://www.linkedin.com/in/audayhussein/?originalSubdomain=ca)
for helpfully sending this file.

# Unmapped readings

In [18]:
# Readings from the corpus that have no entry at all in the sign-list mapping.
# `unmapped` is the set filled by the classification cell of this notebook.
print(f'{len(unmapped):>3} unmapped readings')
# Last expression of the cell: the alphabetical list is shown as the cell's output.
sorted(unmapped)

 80 unmapped readings


['...szu',
 '...x',
 'ah',
 'alamusz',
 'asal2',
 'babila2',
 'barig',
 "bur'u",
 'd',
 'dah',
 'di...',
 'duh',
 'e2ni',
 'eh',
 'eri11',
 'gazx',
 "gesz'u",
 'geszimmar',
 'geszu',
 'gudu4',
 'ha',
 'ha:a',
 'had2',
 'hal',
 'har',
 'he',
 'he2',
 'hi',
 'hu',
 'hub2',
 'hun',
 'hur',
 'ih',
 'inana',
 'isx',
 'isztar',
 'itu',
 'kislah',
 'kux',
 'lah',
 'lah4',
 'lah5',
 'lah6',
 'lal3',
 'm',
 "ma'",
 'mah',
 'muhaldim',
 'n',
 'nigar',
 'nirah',
 'p',
 'pesz2',
 'sa10',
 'sahar',
 'siskur2',
 'sza3}',
 'szagina',
 'szah',
 'szah2',
 'szandana',
 'sze9',
 'szii',
 'szunigin',
 'tah',
 'tap',
 'udru',
 'uh',
 'uh2',
 'uh3',
 'ukken',
 'unu',
 'ura',
 'ururdu',
 'utumu',
 'x...',
 'xxxx',
 '{a',
 '{diszszu',
 '{ki']

# Ambiguously mapped readings

In [19]:
# Readings for which the sign list offers more than one cuneiform string.
# `multiple` maps each such reading to the set of its candidate strings.
print(f'{len(multiple):>3} ambiguously mapped readings')
# Walk the entries in alphabetical order of the reading; the candidates of each
# reading are sorted as well, so the listing is stable between runs.
for (r, unis) in sorted(multiple.items(), key=lambda entry: entry[0]):
  uniStr = ' - '.join(sorted(unis))
  print(f'{r} => ({len(unis)}) => {uniStr}')

 41 ambiguously mapped readings
ba4 => (3) => 𒀀𒀭𒂷 - 𒂷 - 𒍝𒂷𒂷
ba6 => (2) => 𒁀𒌑 - 𒌑
bara2 => (2) => 𒁁 - 𒁈
buru14 => (2) => 𒂘 - 𒂙
dabin => (2) => 𒂠𒊺 - 𒍥𒊺
dilmun => (3) => 𒉌𒌇 - 𒊩𒄸 - 𒊩𒌇
eri => (2) => 𒅕 - 𒌷
erisz => (2) => 𒊩𒈠 - 𒊩𒌆
gala => (3) => 𒃲 - 𒍑𒆪 - 𒍓
gin7 => (2) => 𒁶 - 𒄀
gurusz => (2) => 𒄨 - 𒆗
ia => (2) => 𒅀 - 𒉿
idim => (2) => 𒁁 - 𒅂
ii => (2) => 𒅀 - 𒉿
il => (2) => 𒀧 - 𒅋
iri => (2) => 𒅕 - 𒌷
isz8 => (2) => 𒀹 - 𒌋
iu => (2) => 𒅀 - 𒉿
kam => (2) => 𒄭𒁁 - 𒄰
kesz2 => (2) => 𒂡 - 𒆟
kesz3 => (2) => 𒋙𒀭𒄲 - 𒋙𒀭𒄲𒆠
lum => (2) => 𒈝 - 𒋞
munu4 => (2) => 𒉽𒉽 - 𒉽𒊺𒉽
ne3 => (2) => 𒄊 - 𒊊
nergal => (2) => 𒄊𒀕𒃲 - 𒊊𒀕𒃲
pa2 => (2) => 𒁀 - 𒁀𒌑
pirig => (2) => 𒄊 - 𒊊
puzur4 => (2) => 𒅤𒊭 - 𒆃𒊭
sig17 => (2) => 𒄀 - 𒆬
sze20 => (2) => 𒂠 - 𒅆
t,a2 => (2) => 𒋫 - 𒋬
til => (2) => 𒁁 - 𒌀
us => (2) => 𒊻 - 𒍖
usa => (2) => 𒐍 - 𒑄
usz => (2) => 𒍑 - 𒍖
usz2 => (2) => 𒁁 - 𒍗
uz => (2) => 𒊻 - 𒍖
wa => (2) => 𒁀 - 𒉿
wa2 => (2) => 𒁀 - 𒉌
zi3 => (2) => 𒂠 - 𒍥
ziz2 => (2) => 𒀾 - 𒍩


# Uniquely mapped readings

In [20]:
# Readings that the sign list maps to exactly one cuneiform string.
# `unique` maps each such reading to that single string.
print(f'{len(unique):>3} uniquely mapped readings')
# One line per reading, alphabetically; the reading is right-aligned in 10 columns.
for r in sorted(unique):
  print(f'{r:>10} => {unique[r]}')

624 uniquely mapped readings
         a => 𒀀
        a2 => 𒀉
        ab => 𒀊
       ab2 => 𒀖
      abul => 𒆍𒃲
      abzu => 𒍪𒀊
        ad => 𒀜
        ag => 𒀝
       ag2 => 𒉘
       aga => 𒂆
     agrig => 𒅆𒁾
        ak => 𒀝
    akszak => 𒌔
        al => 𒀠
      alan => 𒀩
        am => 𒄠
       am3 => 𒀀𒀭
       ama => 𒂼
      amar => 𒀫
        an => 𒀭
     ansze => 𒄏
        ap => 𒀊
      apin => 𒀳
        aq => 𒀝
        ar => 𒅈
       ar3 => 𒄯
        as => 𒊍
       as, => 𒊍
       as2 => 𒀾
      asal => 𒍂
      asar => 𒍂
       asz => 𒀸
      asz2 => 𒀾
     asza5 => 𒃷
    aszgab => 𒀿
    asznan => 𒊺𒌁
        at => 𒀜
       at, => 𒀜
        az => 𒊍
       az2 => 𒀾
       az3 => 𒀸
    azlag2 => 𒌆
        ba => 𒁀
    babbar => 𒌓
      bad3 => 𒂦
       bal => 𒁄
      bala => 𒁄
       ban => 𒉼
      ban2 => 𒑏
      ban3 => 𒌉
    banda3 => 𒌉
    banesz => 𒑑
   banszur => 𒍎
    bappir => 𒋋
       bar => 𒁇
       bat => 𒁁
        be => 𒁁
       be2 => 𒁉
        bi => 𒁉
       bi2 => 𒉈
      

In [1]:
import os
import collections
import re
import json

from tf.fabric import Fabric

In [2]:
# Where the repository lives on this machine: <BASE>/<ORG>/<REPO>.
ORG = 'Nino-cunei'
REPO = 'oldbabylonian'
BASE = os.path.expanduser('~/github')
REPO_DIR = f'{BASE}/{ORG}/{REPO}'

# Directory handed to Text-Fabric; it is selected by the data version.
VERSION = '0.3'
TF_DIR = f'{REPO_DIR}/tf/{VERSION}'

# The JSON sign list from which the reading -> cuneiform table is built.
SIGN_FILE = 'GeneratedSignList.json'
WRITING_DIR = f'{REPO_DIR}/sources/writing'
SIGN_PATH = f'{WRITING_DIR}/{SIGN_FILE}'

In [3]:
# Open the Text-Fabric dataset stored under TF_DIR.
TF = Fabric(TF_DIR)
# Take stock of the features present; the result is indexed by 'nodes' and 'edges' below.
allFeatures = TF.explore(silent=True, show=True)
# Load everything that was found: node features followed by edge features.
loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
api = TF.load(loadableFeatures, silent=False)
# Put the API handles into the notebook's global namespace;
# later cells use `F` (node features) without qualification.
api.makeAvailableIn(globals())

This is Text-Fabric 7.4.5
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

25 features found and 0 ignored
  0.00s loading features ...
   |     0.00s B otype                from /Users/dirk/github/Nino-cunei/oldbabylonian/tf/0.3
   |     0.08s B oslots               from /Users/dirk/github/Nino-cunei/oldbabylonian/tf/0.3
   |     0.00s B pnumber              from /Users/dirk/github/Nino-cunei/oldbabylonian/tf/0.3
   |     0.05s B type                 from /Users/dirk/github/Nino-cunei/oldbabylonian/tf/0.3
   |     0.03s B after                from /Users/dirk/github/Nino-cunei/oldbabylonian/tf/0.3
   |     0.05s B atf                  from /Users/dirk/github/Nino-cunei/oldbabylonian/tf/0.3
   |     0.01s B grapheme             from /Users/dirk/github/Nino-cunei/oldbabylonian/tf/0.3
   |     0.05s B reading              from /Users/dirk/github/Nino-cunei/oldbabylonian/tf/0.3
   |     0.00s B collated             from /Users/dirk/github/Nino-cunei/oldbabylonian/tf/0

[('Computed',
  'computed-data',
  ('C Computed', 'Call AllComputeds', 'Cs ComputedString')),
 ('Features', 'edge-features', ('E Edge', 'Eall AllEdges', 'Es EdgeString')),
 ('Fabric', 'loading', ('ensureLoaded', 'TF', 'ignored', 'loadLog')),
 ('Locality', 'locality', ('L Locality',)),
 ('Misc', 'messaging', ('cache', 'error', 'indent', 'info', 'reset')),
 ('Nodes',
  'navigating-nodes',
  ('N Nodes', 'sortKey', 'sortKeyTuple', 'otypeRank', 'sortNodes')),
 ('Features',
  'node-features',
  ('F Feature', 'Fall AllFeatures', 'Fs FeatureString')),
 ('Search', 'search', ('S Search',)),
 ('Text', 'text', ('T Text',))]

In [4]:
# ASCII (ATF-style) spellings of the special consonants, with the Unicode letter
# each one stands for. Keys are the ASCII form, values the lowercase Unicode letter.
# NOTE(review): the unmapped readings reported in this notebook include many with a
# plain 'h' ('ah', 'ha', 'mah', ...). If the sign list spells those values with Ḫ,
# makeAscii turns them into 'h,' and they cannot match - confirm which convention
# the corpus uses before relying on the 'h,' entry.
transUni = {
    'sz': 'š',
    's,': 'ṣ',
    "s'": 'ś',
    't,': 'ṭ',
    'h,': 'ḫ',
}

# Inverse table keyed by the UPPERCASE Unicode letter (e.g. 'Š' -> 'sz').
# Kept under its original name for any code that refers to it directly.
transAscii = {rout.upper(): rin for (rin, rout) in transUni.items()}


def makeUni(r):
  """Turn an ASCII reading into its Unicode transliteration.

  The ASCII digraphs listed in `transUni` are replaced by their Unicode
  letters first; after that the characters ' { } . : are removed.

  Parameters
  ----------
  r: string
    A reading in ASCII transliteration, e.g. 'szu' or "s'a".

  Returns
  -------
  string
    The reading with special consonants in Unicode and without the
    stripped punctuation, e.g. 'šu' or 'śa'.
  """
  for (rin, rout) in transUni.items():
    r = r.replace(rin, rout)
  # The apostrophe is only dropped here, after "s'" has had its chance to match.
  for dropped in ("'", '{', '}', '.', ':'):
    r = r.replace(dropped, '')
  return r


def makeAscii(r):
  """Turn a sign-list value into a lowercase ASCII reading.

  The value is lowercased first and only then are the Unicode consonants of
  `transUni` replaced by their ASCII spelling. Doing it in this order makes
  the conversion work for uppercase and lowercase diacritic letters alike;
  replacing only the uppercase letters would let a lowercase 'š' slip
  through unconverted.

  Parameters
  ----------
  r: string
    A value as found in the sign list, e.g. 'PEŠ3'.

  Returns
  -------
  string
    The lowercase ASCII form, e.g. 'pesz3'.
  """
  r = r.lower()
  for (rin, rout) in transUni.items():
    r = r.replace(rout, rin)
  return r

In [5]:
# The sign list contains cuneiform characters and letters with diacritics.
# Decode it explicitly as UTF-8 so that reading it does not depend on the
# default encoding of the platform the notebook happens to run on.
with open(SIGN_PATH, encoding='utf8') as fh:
  signs = json.load(fh)['signs']

print(f'{len(signs)} signs')

# Table from ASCII reading to the set of cuneiform strings listed for it.
# A set is used because several signs may list the same value.
mapping = collections.defaultdict(set)

for (sign, signData) in signs.items():
  uniStr = signData['signCunei']
  values = signData['values']
  for value in values:
    # Sign-list values are converted to the ASCII form in which readings are looked up.
    valueAscii = makeAscii(value)
    mapping[valueAscii].add(uniStr)

print(f'{len(mapping)} values in table')

1768 signs
8765 values in table


In [6]:
# All distinct values of the `reading` feature over the nodes of type 'sign'.
readings = {F.reading.v(s) for s in F.otype.s('sign')}
# Nodes without a reading contribute None; that is not a reading, so leave it out.
readings.discard(None)
len(readings)

745

In [8]:
# Sort every corpus reading into one of three bins, depending on how many
# cuneiform strings the sign-list table has for it.
unmapped = set()  # readings that are absent from the table
unique = {}       # reading -> its one and only cuneiform string
multiple = {}     # reading -> set of competing cuneiform strings

for r in readings:
  if r in mapping:
    targets = mapping[r]
    if len(targets) == 1:
      # exactly one candidate: store the string itself rather than the set
      unique[r] = next(iter(targets))
    else:
      multiple[r] = targets
  else:
    unmapped.add(r)

print(f'{len(unmapped):>3} unmapped readings')
print(f'{len(multiple):>3} ambiguously mapped readings')
print(f'{len(unique):>3} uniquely mapped readings')

 80 unmapped readings
 41 ambiguously mapped readings
624 uniquely mapped readings
