In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import collections
import pickle
import gzip

from tf.app import use

In [3]:
A = use('oldbabylonian', hoist=globals(), check=True)

TF app is up-to-date.
Using annotation/app-oldbabylonian commit 1f12c687368dec8eabefe35264a30f4d5eac3fb4 (=latest)
  in /Users/dirk/text-fabric-data/__apps__/oldbabylonian.
No new data release available online.
Using Nino-cunei/oldbabylonian/tf - 1.0.1 rv1.0.1 (=latest) in /Users/dirk/text-fabric-data.


# Parallels

We make edges between similar lines.

When are lines similar?

If a certain distance metric is above a certain threshold.

We choose this metric:

* we reduce a line to the set of readings and graphemes in it, excluding unknown signs and ellipses.
* the similarite between two lines is the length of the intersection divided by the length of the union of their sets times 100.

# Preparation

We pre-compute all sets for all lines.

In [4]:
READABLE_TYPES = {'reading', 'grapheme', 'numeral', 'complex'}

def makeSet(l):
  if F.lnc.v(l): # comment line
    return None
  lineSet = set()
  for s in L.d(l, otype='sign'):
    if F.type.v(s) in READABLE_TYPES:
      r = F.readingr.v(s)
      if r:
        lineSet.add(r)
      g = F.graphemer.v(s)
      if g:
        lineSet.add(g)
  return lineSet

In [5]:
lines = {}

for l in F.otype.s('line'):
  lineSet = makeSet(l)
  if lineSet:
    lines[l] = lineSet
    
nLines = len(lines)
print(f'{nLines} lines')

25923 lines


# Measure

In [6]:
def sim(lSet, mSet):
  return int(round(100 * len(lSet & mSet) / len(lSet | mSet)))

# Compute all similarities

We are going to perform several hundreds of millions of comparisons, each of which is more than an elemetary operation.

Let's measure time.

In [7]:
THRESHOLD = 90

def computeSim(limit=None):
  similarity = {}

  lineNodes = sorted(lines.keys())
  nLines = len(lineNodes)

  nComparisons = nLines * (nLines - 1) // 2

  print(f'{nComparisons} comparisons to make')
  chunkSize = nComparisons // 100

  co = 0
  b = 0
  si = 0
  p = 0

  indent(reset=True)

  stop = False
  for i in range(nLines):
    nodeI = lineNodes[i]
    lineI = lines[nodeI]
    for j in range(i + 1, nLines):
      nodeJ = lineNodes[j]
      lineJ = lines[nodeJ]
      s = sim(lineI, lineJ)
      co += 1
      b += 1
      if b == chunkSize:
        p += 1
        info(f'{p:>3}% - {co:>12} comparisons and {si:>10} similarities')
        b = 0
        if limit is not None and p >= limit:
          stop = True
          break

      if s < THRESHOLD:
        continue
      similarity[(nodeI, nodeJ)] = sim(lineI, lineJ)
      si += 1
    if stop:
      break

  info(f'{p:>3}% - {co:>12} comparisons and {si:>10} similarities')
  return similarity

We are going to run it to 3% first and do some checks then.

In [8]:
similarity = computeSim(limit=3)

335988003 comparisons to make
  4.45s   1% -      3359880 comparisons and       5695 similarities
  8.60s   2% -      6719760 comparisons and      10604 similarities
    13s   3% -     10079640 comparisons and      16028 similarities
    13s   3% -     10079640 comparisons and      16028 similarities


We check the sanity of the results.

In [9]:
print(min(similarity.values()))
print(max(similarity.values()))

90
100


In [10]:
eq = [x for x in similarity.items() if x[1] >= 100]
neq = [x for x in similarity.items() if x[1] <= 90]

In [11]:
print(len(eq))
print(len(neq))

16006
11


In [12]:
print(eq[0])
print(neq[0])

((230787, 235393), 100)
((230796, 230810), 90)


In [13]:
A.plain(eq[0][0][0])
A.plain(eq[0][0][1])

In [14]:
A.plain(neq[0][0][0])
A.plain(neq[0][0][1])

Looks good.

Now the whole computation.

But if we have done this before, and nothing has changed, we load previous results from disk.

If we do not find previous results, we compute them and save the results to disk.

In [15]:
PARA_DIR = f'{A.tempDir}/parallels'

def writeResults(data, location, name):
  if not os.path.exists(location):
    os.makedirs(location, exist_ok=True)
  path = f'{location}/{name}'
  with gzip.open(path, 'wb') as f:
    pickle.dump(data, f)
  print(f'Data written to {path}')
  
def readResults(location, name):
  path = f'{location}/{name}'
  if not os.path.exists(path):
    print(f'File not found: {path}')
    return None
  with gzip.open(path, 'rb') as f:
    data = pickle.load(f)
  print(f'Data read from {path}')
  return data

In [16]:
similarity = readResults(PARA_DIR, 'sim.zip')
if not similarity:
  similarity = computeSim()
  writeResults(similarity, PARA_DIR, 'sim.zip')

Data read from /Users/dirk/text-fabric-data/Nino-cunei/oldbabylonian/_temp/parallels/sim.zip


In [17]:
len(similarity)

574579

So, over half a million pairs of similar lines.

Let's find out which lines have the most correspondences.

In [18]:
parallels = {}

for (l, m) in similarity:
  parallels.setdefault(l, set()).add(m)
  parallels.setdefault(m, set()).add(l)
  
print(f'{len(parallels)} out of {nLines} lines have at least one similar line')

8344 out of 25923 lines have at least one similar line


In [19]:
rankedParallels = sorted(
  parallels.items(),
  key=lambda x: (-len(x[1]), x[0]),
)

In [23]:
for (l, paras) in rankedParallels[0:10]:
  print(f'{len(paras):>4} siblings of {l} = {T.text(l)}')

1005 siblings of 230788 = qi2-bi2-[ma]
1005 siblings of 230824 = qi2-bi2-ma
1005 siblings of 230842 = [qi2]-bi2#-ma
1005 siblings of 230906 = qi2-bi2-ma
1005 siblings of 230914 = qi2-bi2-ma
1005 siblings of 231002 = qi2-bi2-ma
1005 siblings of 231020 = qi2-bi2-ma
1005 siblings of 231053 = [qi2]-bi2-ma
1005 siblings of 231066 = qi2-bi2-ma
1005 siblings of 231086 = [qi2]-bi2#-ma


In [22]:
for (l, paras) in rankedParallels[1000:1010]:
  print(f'{len(paras):>4} siblings of {T.text(l)}')

1005 siblings of qi2-bi2-ma
1005 siblings of qi2-bi2-ma
1005 siblings of qi2-bi2-ma
1005 siblings of qi2-bi2-ma
1005 siblings of qi2-bi2-ma
1005 siblings of qi2-[bi2]-ma
 128 siblings of {d}utu u3 {d}marduk li-ba-al-li-t,u2-ka
 128 siblings of {d}utu u3 {d}marduk li-ba-al-li-t,u2-ka
 128 siblings of {d}utu u3 {d}marduk li-ba-al#-li#-t,u2#-[ka]
 128 siblings of [{d}utu u3 {d}]marduk# li#-[ba]-al#-[li]-t,u2#-ka#


In [24]:
for (l, paras) in rankedParallels[1130:1140]:
  print(f'{len(paras):>4} siblings of {T.text(l)}')

 128 siblings of _{d}utu_ u3 _{d}marduk_ li-ba-al-li-t,u2-ka
 128 siblings of _{d}utu_ u3 _{d}marduk_ li-ba-al#-li#-t,u2#-ka#
 128 siblings of _{d}utu_ u3 _{d}marduk_ li-ba-al-li-t,u2-ka#
 127 siblings of {d}utu u3 {d}marduk li-ba-al-li-t,u2-ka!(KI)
 127 siblings of {d}utu u3 {d}marduk tu-ba-al-li-t,u2-ka
 124 siblings of [li]-ba-[al]-li-[t,u2-ka]
 124 siblings of li-ba-al-li-t,u2-ka
 124 siblings of li-ba-al-li#-t,u2-ka
 124 siblings of li-ba-al-li-t,u2-ka
 124 siblings of li-ba-al-li-t,u2-ka


And how many lines have just one correspondence?

We look at the tail of rankedParallels.

In [25]:
pairs = [(x, list(paras)[0]) for (x, paras) in rankedParallels if len(paras) == 1]
print(f'There are {len(pairs)} exclusively parallel pairs of lines')

There are 1994 exclusively parallel pairs of lines


Why not make an overview of exactly how wide-spread parallel lines are?

We count how many lines have how many parallels.

In [28]:
parallelCount = collections.Counter()

buckets = (2, 10, 20, 50, 100)

bucketRep = {}
prevBucket = None
for bucket in buckets:
  if prevBucket is None:
    bucketRep[bucket] = f'       n <= {bucket:>3}'
  elif bucket == buckets[-1]:
    bucketRep[bucket] = f'       n >  {bucket:>3}'
  else:
    bucketRep[bucket] = f'{prevBucket:>3} <  n <= {bucket:>3}'
  prevBucket = bucket

for (l, paras) in rankedParallels:
  clusterSize = len(paras) + 1
  if clusterSize > buckets[-1]:
    theBucket = buckets[-1]
  else:
    for bucket in buckets:
      if clusterSize <= bucket:
        theBucket = bucket
        break
  parallelCount[theBucket] += 1
  
for (bucket, amount) in sorted(
  parallelCount.items(),
  key=lambda x: (-x[0], x[1]),
):
  print(f'{amount:>4} lines have n sisters where {bucketRep[bucket]}')

1972 lines have n sisters where        n >  100
 939 lines have n sisters where  20 <  n <=  50
 793 lines have n sisters where  10 <  n <=  20
2646 lines have n sisters where   2 <  n <=  10
1994 lines have n sisters where        n <=   2


# Add parallels to the TF dataset

We can add this information to the Oldbabylonian dataset as an *edge feature*.

An edge feature links two nodes and may annotate that link with a value.

For parallels, we link each line to each of its parallel lines and we annotate that link with the similarity between
the two lines. The similarity is a percentage, and we round it to integer values.

If *n1* is similar to *n2*, then *n2* is similar to *n1*.
In order to save space, we only add such links once.

We can then use
[`E.sim.b(node)`](https://annotation.github.io/text-fabric/Api/Features/#edge-features)
to find all nodes that are parallel to node.


In [29]:
metaData = {
  '': {
    'name': 'AbB Old Babylonian Cuneiform',
    'editor': 'Cale Johnson et. al.',
    'institute': 'CDL',
    'converters': 'Cale Johnson, Dirk Roorda',    
  },
  'sim': {
    'valueType': 'int',
    'edgeValues': True,
    'description': 'similarity between lines, as a percentage of the common material wrt the combined material',
  },
}

In [30]:
simData = {}
for ((f, t), d) in similarity.items():
  simData.setdefault(f, {})[t] = d

In [34]:
ghBase = os.path.expanduser('~/github')
subdir = 'parallels'
path = f'{A.org}/{A.repo}/{subdir}/tf'
location = f'{ghBase}/{path}'
module = A.version

In [35]:
TF.save(edgeFeatures=dict(sim=simData), metaData=metaData, location=location, module=module)

   |     0.52s T sim                  to /Users/dirk/github/Nino-cunei/oldbabylonian/parallels/tf/1.0.1


True

# Turn the parallels feature into a module

The new `sim` feature is a big data feature. You do not want to load it all the time.

Here we show how to turn it into a module, so that users can easily load it in a Jupyter notebook or in the TF browser.

**N.B. The next cell shows how to construct a module. Since this module exists, there is no point for you to redo it.
Later on we show how to use this module.**

In [39]:
%%bash
text-fabric-zip 'Nino-cunei/oldbabylonian/parallels/tf'

True
Create release data for Nino-cunei/oldbabylonian/parallels/tf
zip files end up in /Users/dirk/Downloads/Nino-cunei-release/oldbabylonian
zipping Nino-cunei/oldbabylonian  1.0.1 with   1 features ==> parallels-tf-1.0.1.zip


I have added this file to a new release of the Oldbabylonian Github repo.

# Use the parallels module

We load the Oldbabylonian corpus again, but now with the parallels module.

In [41]:
TF = use('oldbabylonian', hoist=globals(), check=True, mod='Nino-cunei/oldbabylonian/parallels/tf')

TF app is up-to-date.
Using annotation/app-oldbabylonian commit 1f12c687368dec8eabefe35264a30f4d5eac3fb4 (=latest)
  in /Users/dirk/text-fabric-data/__apps__/oldbabylonian.
	downloading Nino-cunei/oldbabylonian - 1.0.1 rv1.1
	from https://github.com/Nino-cunei/oldbabylonian/releases/download/v1.1/tf-1.0.1.zip ...
	unzipping ...
	saving Nino-cunei/oldbabylonian - 1.0.1 rv1.1
	saved Nino-cunei/oldbabylonian - 1.0.1 rv1.1
Using Nino-cunei/oldbabylonian/tf - 1.0.1 rv1.1 (=latest) in /Users/dirk/text-fabric-data
	downloading Nino-cunei/oldbabylonian - 1.0.1 rv1.1
	from https://github.com/Nino-cunei/oldbabylonian/releases/download/v1.1/parallels-tf-1.0.1.zip ...
	unzipping ...
	saving Nino-cunei/oldbabylonian - 1.0.1 rv1.1
	saved Nino-cunei/oldbabylonian - 1.0.1 rv1.1
Using Nino-cunei/oldbabylonian/parallels/tf - 1.0.1 rv1.1 (=latest) in /Users/dirk/text-fabric-data
   |      |     0.09s C __levels__           from otype, oslots, otext
   |      |     1.56s C __order__            from otype,

Lo and behold: you see the parallels module li