In [1]:
import collections

from tf.app import use

In [2]:
A = use('oldbabylonian', hoist=globals(), check=True)

TF app is up-to-date.
Using annotation/app-oldbabylonian commit 1f12c687368dec8eabefe35264a30f4d5eac3fb4 (=latest)
  in /Users/dirk/text-fabric-data/__apps__/oldbabylonian.
No new data release available online.
Using Nino-cunei/oldbabylonian/tf - 1.0.1 rv1.0.1 (=latest) in /Users/dirk/text-fabric-data.


# Parallels

We make edges between similar lines.

When are lines similar?

If a certain distance metric is above a certain threshold.

We choose this metric:

* we reduce a line to the set of readings and graphemes in it, excluding unknown signs and ellipses.
* the similarite between two lines is the length of the intersection divided by the length of the union of their sets times 100.

# Preparation

We pre-compute all sets for all lines.

In [3]:
READABLE_TYPES = {'reading', 'grapheme', 'numeral', 'complex'}

def makeSet(l):
  if F.lnc.v(l): # comment line
    return None
  lineSet = set()
  for s in L.d(l, otype='sign'):
    if F.type.v(s) in READABLE_TYPES:
      r = F.readingr.v(s)
      if r:
        lineSet.add(r)
      g = F.graphemer.v(s)
      if g:
        lineSet.add(g)
  return lineSet

In [10]:
lines = {}

for l in F.otype.s('line'):
  lineSet = makeSet(l)
  if lineSet:
    lines[l] = makeSet(l)
    
print(f'{len(lines)} lines')

25923 lines


# Measure

In [8]:
def sim(lSet, mSet):
  return 100 * len(lSet & mSet) / len(lSet | mSet)

# Compute all similarities

We are going to perform millions of comparisons, each of which is more than an elemetary operation.

Let's measure time.

In [51]:
THRESHOLD = 90

def computeSim(limit=None):
  similarity = {}

  lineNodes = sorted(lines.keys())
  nLines = len(lineNodes)

  nComparisons = nLines * (nLines - 1) // 2

  print(f'{nComparisons} comparisons to make')
  chunkSize = nComparisons // 100

  co = 0
  b = 0
  si = 0
  p = 0

  indent(reset=True)

  stop = False
  for i in range(nLines):
    nodeI = lineNodes[i]
    lineI = lines[nodeI]
    for j in range(i + 1, nLines):
      nodeJ = lineNodes[j]
      lineJ = lines[nodeJ]
      s = sim(lineI, lineJ)
      co += 1
      b += 1
      if b == chunkSize:
        p += 1
        info(f'{p:>3}% - {co:>12} comparisons and {si:>10} similarities')
        b = 0
        if limit is not None and p >= limit:
          stop = True
          break

      if s < THRESHOLD:
        continue
      similarity[(nodeI, nodeJ)] = sim(lineI, lineJ)
      si += 1
    if stop:
      break

  info(f'{p:>3}% - {co:>12} comparisons and {si:>10} similarities')
  return similarity

We are going to run it to 3% first and do some checks then.

In [52]:
similarity = computeSim(limit=3)

335988003 comparisons to make
  3.71s   1% -      3359880 comparisons and       5695 similarities
  7.16s   2% -      6719760 comparisons and      10604 similarities
    11s   3% -     10079640 comparisons and      16028 similarities
    11s   3% -     10079640 comparisons and      16028 similarities


We check the sanity of the results.

In [53]:
print(min(similarity.values()))
print(max(similarity.values()))

90.0
100.0


In [54]:
eq = [x for x in similarity.items() if x[1] >= 100]
neq = [x for x in similarity.items() if x[1] <= 90]

In [55]:
print(len(eq))
print(len(neq))

16006
11


In [56]:
print(eq[0])
print(neq[0])

((230787, 235393), 100.0)
((230796, 230810), 90.0)


In [57]:
A.plain(eq[0][0][0])
A.plain(eq[0][0][1])

In [58]:
A.plain(neq[0][0][0])
A.plain(neq[0][0][1])

Looks good.

Now the whole computation:

In [39]:
similarity = computeSim()

335988003 comparisons to make
  3.53s   1% -      3359880 comparisons and       5695 similarities
  7.04s   2% -      6719760 comparisons and      10604 similarities
    11s   3% -     10079640 comparisons and      16028 similarities
    14s   4% -     13439520 comparisons and      24005 similarities
    17s   5% -     16799400 comparisons and      30694 similarities
    21s   6% -     20159280 comparisons and      37992 similarities
    24s   7% -     23519160 comparisons and      44479 similarities
    28s   8% -     26879040 comparisons and      49298 similarities
    31s   9% -     30238920 comparisons and      56259 similarities
    35s  10% -     33598800 comparisons and      59520 similarities
    38s  11% -     36958680 comparisons and      64009 similarities
    41s  12% -     40318560 comparisons and      69139 similarities
    45s  13% -     43678440 comparisons and      71511 similarities
    48s  14% -     47038320 comparisons and      75137 similarities
    52s  15% -    

So, over half a million pairs of similar lines.

Let's find out which lines have the most correspondences.

In [42]:
parallels = {}

for (l, m) in similarity:
  parallels.setdefault(l, set()).add(m)

In [43]:
rankedParallels = sorted(
  parallels.items(),
  key=lambda x: (-len(x[1]), x[0]),
)

In [46]:
for (l, paras) in rankedParallels[0:10]:
  print(f'{len(paras):>4} siblings of {T.text(l)}')

1005 siblings of qi2-bi2-[ma]
1004 siblings of qi2-bi2-ma
1003 siblings of [qi2]-bi2#-ma
1002 siblings of qi2-bi2-ma
1001 siblings of qi2-bi2-ma
1000 siblings of qi2-bi2-ma
 999 siblings of qi2-bi2-ma
 998 siblings of [qi2]-bi2-ma
 997 siblings of qi2-bi2-ma
 996 siblings of [qi2]-bi2#-ma


In [48]:
for (l, paras) in rankedParallels[1000:1010]:
  print(f'{len(paras):>4} siblings of {T.text(l)}')

  91 siblings of um-ma ha-am-mu-ra-bi-ma
  91 siblings of li-ba-al-li-t,u2-ka
  91 siblings of {d}utu u3 {d}marduk li-ba-al-li#-[t,u2-ka]
  91 siblings of qi2#-bi2-ma#
  90 siblings of um-ma ha-am-mu-ra-bi-ma
  90 siblings of li-ba-al-li-t,u2#-ka#
  90 siblings of {d}utu u3 {d}marduk li-ba-al-li-t,u2-<ka>
  90 siblings of qi2-bi2-ma
  89 siblings of um#-ma ha-am-mu-ra-bi-ma
  89 siblings of li-ba-al-li-t,u2-ka


This is just the beginning!