In [69]:
import os
import collections
import pickle
from unicodedata import name as uname

In [79]:
# Where the local clone of the repository lives, and which repository it is.
BASE = os.path.expanduser("~/github")
ORG = "Nino-cunei"
REPO = "oldbabylonian"
VERSION = "0.2"

# The sign mapping exists in two flavours next to each other:
# a tab-separated text file (_T) and a pickle (_P).
REPO_PATH = "/".join((BASE, ORG, REPO))
MAP_FILE_T = REPO_PATH + "/sources/writing/signs.txt"
MAP_FILE_P = REPO_PATH + "/sources/writing/signs.p"

In [78]:
def checkSignMapData(path, textPath=None):
    """Load the pickled sign table, verify it, export it as text and report on it.

    The pickle is indexed below as several copies of one table: `pData[i]` is a
    list of rows, each row a list of strings, and the first row holds the
    headers.  The function

    1. compares every copy against copy 0;
    2. writes copy 0 (headers included) as tab-separated text;
    3. prints duplicate reports for a fixed set of column combinations;
    4. prints a sample of the rows.

    Parameters
    ----------
    path: string
        Pickle file to read.
    textPath: string, optional
        Where the tab-separated export is written.
        When omitted, the module-level `MAP_FILE_T` is used.

    Returns
    -------
    list or None
        The rows without the header row.
        `None` when the copies disagree, or when there are no copies,
        no header row, or no rows after the header.
    """

    # NOTE(security): pickle.load can execute arbitrary code;
    # only feed this function pickles from a trusted source.
    with open(path, "rb") as fh:
        pData = pickle.load(fh)

    if textPath is None:
        textPath = MAP_FILE_T

    def compare(col1, col2):
        """Report whether two copies of the table are identical, row by row."""
        p1Data = pData[col1]
        p2Data = pData[col2]
        for (i, (row1, row2)) in enumerate(zip(p1Data, p2Data)):
            if row1 != row2:
                # only the first differing row is shown
                print(f"row {i} is different")
                print(row1)
                print(row2)
                return False
        # zip stops at the shorter copy, so a length mismatch needs its own check
        if len(p1Data) != len(p2Data):
            print(
                f"columns {col1} and {col2} differ in length:"
                f" {len(p1Data)} versus {len(p2Data)} rows"
            )
            return False
        print(f"columns {col1} and {col2} are equal")
        return True

    # every copy is compared, also after a mismatch, so that all get reported
    good = True
    for c in range(1, len(pData)):
        if not compare(0, c):
            good = False

    if not good or not len(pData):
        print("No data delivered")
        return None

    data = pData[0]
    lData = len(data)

    if not lData:
        print("Data is empty")
        return None

    # explicit encoding: the table contains cuneiform characters,
    # which a non-UTF8 platform default could not encode
    with open(textPath, "w", encoding="utf8") as tfh:
        for row in data:
            rowStr = "\t".join(row)
            tfh.write(f"{rowStr}\n")
    print(f"Written data to {textPath}")

    headers = data[0]
    data = data[1:]
    lData -= 1
    print(f"headers = {headers}")

    if not lData:
        print("No rows")
        return None

    # maximum number of items shown in any listing below
    batch = 10

    langs = {row[3] for row in data}
    print(f"Data has {lData} rows")
    print(f"Data has languages {langs}")

    # check whether forms are unique

    def checkUnique(cols, per=None):
        """Print the values that occur more than once in the given column(s).

        `cols` is a column index or a sequence of them; several columns are
        combined into one comma-joined value.  When `per` is a column index,
        the rows are grouped by that column and each group is checked on its
        own.
        """
        if isinstance(cols, int):
            cols = (cols,)
        colNames = ",".join(headers[col] for col in cols)

        if per is None:
            indent = ""
            chunks = {None: data}
        else:
            indent = "\t"
            chunks = collections.defaultdict(list)
            for row in data:
                chunks[row[per]].append(row)

        for (perVal, rows) in sorted(chunks.items()):
            if perVal is not None:
                print(f'rows with {headers[per]} = "{perVal}"')

            # fresh bookkeeping per group: a value seen in one group
            # must not count as a duplicate in the next one
            values = set()
            duplicates = set()
            for row in rows:
                value = ",".join(row[col] for col in cols)
                dest = duplicates if value in values else values
                dest.add(value)

            if duplicates:
                lDups = len(duplicates)
                print(f"{indent}found {lDups} duplicate {colNames}s:")
                rest = "" if lDups <= batch else " and more"
                dupStr = " ".join(sorted(duplicates)[0:batch])
                print(f"{indent}\t{dupStr}{rest}")
            else:
                print(f"{indent}no duplicate {colNames}s")

    checkUnique(0)
    checkUnique(2)
    checkUnique((0, 1))
    checkUnique((0, 1, 3))
    checkUnique(0, per=3)

    if lData <= 2 * batch:
        # head and tail would overlap: show every row exactly once
        for row in data:
            print(row)
    else:
        for row in data[0:batch]:
            print(row)
        print(f"\n... {lData - 2 * batch} rows ...\n")
        for row in data[-batch:]:
            print(row)

    return data


# Run the check on the pickled sign mapping.
# MAP_FILE_P is the pickle path from the configuration cell; the name MAP_FILE
# that used to be passed here is not defined anywhere in this notebook.
data = checkSignMapData(MAP_FILE_P)

columns 0 and 1 are equal
columns 0 and 2 are equal
Written data to /Users/dirk/github/Nino-cunei/oldbabylonian/sources/writing/signs.txt
headers = ['value', 'form', 'character', 'language']
Data has 9907 rows
Data has languages {'sux'}
found 446 duplicate values:
	...Ac ...ingara /cumun/ 1(car)u) 1/3(dic@c) 1/4 2/3(dic@c) 4(dic@v) 4(dic@v@c) KWU127~a and more
found 1311 duplicate characters:
	░░ 𒀀 𒀀𒀀 𒀀𒀭 𒀀𒁺 𒀀𒂔 𒀀𒂔𒇲 𒀀𒃼 𒀀𒄠 𒀀𒄩 and more
found 75 duplicate value,forms:
	4(dic@v),LIMMU 4(dic@v@c),LIMMU KWU127~a,|ZI&ZI| LAK469~a,|ZI&ZI| MZL101~a,|ZI&ZI| arata,|LAM×(KUR.RU)| at,AD at2,GIR₂@g bat,BAD bit,E₂ and more
found 75 duplicate value,form,languages:
	4(dic@v),LIMMU,sux 4(dic@v@c),LIMMU,sux KWU127~a,|ZI&ZI|,sux LAK469~a,|ZI&ZI|,sux MZL101~a,|ZI&ZI|,sux arata,|LAM×(KUR.RU)|,sux at,AD,sux at2,GIR₂@g,sux bat,BAD,sux bit,E₂,sux and more
rows with language = "sux"
	found 446 duplicate values:
		...Ac ...ingara /cumun/ 1(car)u) 1/3(dic@c) 1/4 2/3(dic@c) 4(dic@v) 4(dic@v@c) KWU127~a and more
['LA

In [42]:
def makeMap(data):
    """Placeholder without an implementation yet: ignores `data`, returns None."""
    return None

In [17]:
# Unicode blocks to scan: block name -> (first, last) code point,
# both given as hexadecimal strings, both inclusive.
cuneiBlocks = dict(
    (
        ("Cuneiform", ("12000", "123FF")),
        ("Cuneiform Numbers and Punctuation", ("12400", "1247F")),
        ("Early Dynastic Cuneiform", ("12480", "1254F")),
    )
)

In [None]:
# English number words (lower case) -> their integer value, one up to nine.
cNumber = {
    word: value
    for (value, word) in enumerate(
        "one two three four five six seven eight nine".split(), start=1
    )
}

# Lower-cased glyph names that are accepted after the number word
# in the name of a numeric sign.
numericGlyphs = {
    "ash",
    "ash9",
    "ban2",
    "buru",
    "dish",
    "eshe3",
    "esh16",
    "esh21",
    "gesh2",
    "geshu",
    "ilimmu",
    "ilimmu3",
    "ilimmu4",
    "imin",
    "imin3",
    "limmu",
    "limmu4",
    "shar2",
    "sharu",
    "u",
    "ussu",
    "ussu3",
}

# Fraction words -> their denominator.
fractions = {
    "half": 2,
    "third": 3,
    "thirds": 3,
    "quarter": 4,
    "sixths": 6,
    "eighth": 8,
}

In [63]:
# Walk through every code point of the cuneiform Unicode blocks and sort the
# named characters into buckets, based on the words of their Unicode name.

pos = 0  # code points visited, whether they have a name or not
nChars = 0  # code points that do have a Unicode name

noUni = []  # code points without a Unicode name

# kind -> list of (code point, short name) pairs; the kinds are
# sign, number, numberVar, numberSpecial, punct, odd, other and no
glyphs = collections.defaultdict(list)

for (cuneiBlock, (start, end)) in cuneiBlocks.items():
    for u in range(int(start, 16), int(end, 16) + 1):
        pos += 1
        c = chr(u)
        name = uname(c, None)
        if name is None:
            noUni.append(u)
            continue
        nChars += 1
        if not name.startswith("CUNEIFORM "):
            # Store a pair like every other bucket does, because the report
            # below unpacks pairs; and do not classify the character a second
            # time under one of the regular kinds.
            glyphs["no"].append((u, name.lower()))
            continue
        # lower-cased words of the name, without the leading "cuneiform"
        parts = [p.lower() for p in name.split()][1:]
        kind = "other"
        if parts[0] == "sign":
            kind = "sign"
            parts = parts[1:]
        elif parts[0] == "numeric":
            kind = "number"
            parts = parts[1:]
            if parts[0] != "sign":
                kind = "odd"
            else:
                parts = parts[1:]
                num = cNumber.get(parts[0], None)
                variant = ""
                if num is None:
                    # the name does not start with a number word one..nine
                    kind = "numberSpecial"
                else:
                    # NOTE(review): the numeric value is dropped here, so that
                    # e.g. "TWO ASH" and "NINE ASH" both end up with the short
                    # name "ash" - confirm whether the count should be kept.
                    parts = parts[1:]
                    if "variant" in parts and "form" in parts:
                        variant = "~"
                        kind = "numberVar"
                        parts.remove("variant")
                        parts.remove("form")
                    # anything other than exactly one known glyph name
                    # needs a closer look
                    if (
                        len(parts) == 0
                        or parts[0] not in numericGlyphs
                        or len(parts) > 1
                    ):
                        if len(parts) == 2:
                            if parts[1] in {"a", "b"}:
                                # trailing A/B marks a variant of the glyph
                                variant = f"~{parts[1]}"
                                kind = "numberVar"
                                parts = parts[0:-1]
                            elif parts[1] == "tenu":
                                # accepted as is: kind and parts stay untouched
                                pass
                            else:
                                kind = "odd"
                        else:
                            kind = "odd"
                    if variant:
                        parts.append(variant)
        elif parts[0] == "punctuation":
            kind = "punct"
            parts = parts[1:]
            if parts[0] != "sign":
                kind = "odd"
            else:
                parts = parts[1:]

        glyphs[kind].append((u, " ".join(parts)))

print(f"{pos} positions; {nChars} cuneiform characters")
if pos - nChars:
    print(f"{pos - nChars} skipped positions")

# per kind: the total, followed by at most the first 20 members
for (kind, unis) in sorted(glyphs.items()):
    print(f"{kind}: {len(unis)}")
    for (u, shortName) in unis[0:20]:
        c = chr(u)
        print(f"{u:>03x} = {c} = {shortName} <= {uname(c)}")

1360 positions; 1234 cuneiform characters
126 skipped positions
number: 65
12400 = 𒐀 = ash <= CUNEIFORM NUMERIC SIGN TWO ASH
12401 = 𒐁 = ash <= CUNEIFORM NUMERIC SIGN THREE ASH
12402 = 𒐂 = ash <= CUNEIFORM NUMERIC SIGN FOUR ASH
12403 = 𒐃 = ash <= CUNEIFORM NUMERIC SIGN FIVE ASH
12404 = 𒐄 = ash <= CUNEIFORM NUMERIC SIGN SIX ASH
12405 = 𒐅 = ash <= CUNEIFORM NUMERIC SIGN SEVEN ASH
12406 = 𒐆 = ash <= CUNEIFORM NUMERIC SIGN EIGHT ASH
12407 = 𒐇 = ash <= CUNEIFORM NUMERIC SIGN NINE ASH
12408 = 𒐈 = dish <= CUNEIFORM NUMERIC SIGN THREE DISH
12409 = 𒐉 = dish <= CUNEIFORM NUMERIC SIGN FOUR DISH
1240a = 𒐊 = dish <= CUNEIFORM NUMERIC SIGN FIVE DISH
1240b = 𒐋 = dish <= CUNEIFORM NUMERIC SIGN SIX DISH
1240c = 𒐌 = dish <= CUNEIFORM NUMERIC SIGN SEVEN DISH
1240d = 𒐍 = dish <= CUNEIFORM NUMERIC SIGN EIGHT DISH
1240e = 𒐎 = dish <= CUNEIFORM NUMERIC SIGN NINE DISH
1240f = 𒐏 = u <= CUNEIFORM NUMERIC SIGN FOUR U
12410 = 𒐐 = u <= CUNEIFORM NUMERIC SIGN FIVE U
12411 = 𒐑 = u <= CUNEIFORM NUMERIC SIGN SIX U
124

In [45]:
# Scratch check: look up the Unicode name of a single character.
uname("a")

'LATIN SMALL LETTER A'