In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import collections
import pickle
import gzip

from tf.app import use

In [3]:
A = use("Nino-cunei/ninmed:clone", checkout="clone", hoist=globals())

This is Text-Fabric 9.2.5
Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html

45 features found and 0 ignored


# Parallels

We make edges between similar lines.

When are lines similar?

If a certain distance metric is above a certain threshold.

We choose this metric:

* we reduce a line to the set of readings and graphemes in it, excluding unknown signs and ellipses.
* the similarity between two lines is the length of the intersection divided by the length of the union of their sets times 100.

# Preparation

We pre-compute all sets for all lines.

In [4]:
READABLE_TYPES = {"reading", "grapheme"}


def makeSet(ln):
    lineSet = set()
    for s in L.d(ln, otype="sign"):
        if F.type.v(s) in READABLE_TYPES:
            r = F.reading.v(s)
            if r:
                lineSet.add(r)
            g = F.grapheme.v(s)
            if g:
                lineSet.add(g)
    return lineSet

In [5]:
lines = {}

for ln in F.otype.s("line"):
    lineSet = makeSet(ln)
    if lineSet:
        lines[ln] = lineSet

nLines = len(lines)
print(f"{nLines} lines")

2920 lines


# Measure

In [6]:
def sim(lSet, mSet):
    return int(round(100 * len(lSet & mSet) / len(lSet | mSet)))

# Compute all similarities

We are going to perform several hundreds of millions of comparisons, each of which is more than an elemetary operation.

Let's measure time.

In [7]:
THRESHOLD = 80


def computeSim(limit=None):
    similarity = {}

    lineNodes = sorted(lines.keys())
    nLines = len(lineNodes)

    nComparisons = nLines * (nLines - 1) // 2

    print(f"{nComparisons} comparisons to make")
    chunkSize = nComparisons // 100

    co = 0
    b = 0
    si = 0
    p = 0

    A.indent(reset=True)

    stop = False
    for i in range(nLines):
        nodeI = lineNodes[i]
        lineI = lines[nodeI]
        for j in range(i + 1, nLines):
            nodeJ = lineNodes[j]
            lineJ = lines[nodeJ]
            s = sim(lineI, lineJ)
            co += 1
            b += 1
            if b == chunkSize:
                p += 1
                A.info(f"{p:>3}% - {co:>12} comparisons and {si:>10} similarities")
                b = 0
                if limit is not None and p >= limit:
                    stop = True
                    break

            if s < THRESHOLD:
                continue
            similarity[(nodeI, nodeJ)] = sim(lineI, lineJ)
            si += 1
        if stop:
            break

    A.info(f"{p:>3}% - {co:>12} comparisons and {si:>10} similarities")
    return similarity

We are going to run it to 2% first and do some checks then.

In [8]:
similarity = computeSim(limit=2)

4261740 comparisons to make
  0.07s   1% -        42617 comparisons and          6 similarities
  0.13s   2% -        85234 comparisons and         10 similarities
  0.13s   2% -        85234 comparisons and         10 similarities


We check the sanity of the results.

In [9]:
print(min(similarity.values()))
print(max(similarity.values()))

80
100


In [10]:
eq = [x for x in similarity.items() if x[1] >= 100]
neq = [x for x in similarity.items() if x[1] <= 90]

In [11]:
print(len(eq))
print(len(neq))

9
1


In [12]:
print(eq[0])
print(neq[0])

((85973, 85982), 100)
((85977, 88607), 80)


In [13]:
A.plain(eq[0][0][0])
A.plain(eq[0][0][1])

In [14]:
A.plain(neq[0][0][0])
A.plain(neq[0][0][1])

Looks good.

Now the whole computation.

But if we have done this before, and nothing has changed, we load previous results from disk.

If we do not find previous results, we compute them and save the results to disk.

In [15]:
PARA_DIR = f"{A.tempDir}/parallels"


def writeResults(data, location, name):
    if not os.path.exists(location):
        os.makedirs(location, exist_ok=True)
    path = f"{location}/{name}"
    with gzip.open(path, "wb") as f:
        pickle.dump(data, f)
    print(f"Data written to {path}")


def readResults(location, name):
    path = f"{location}/{name}"
    if not os.path.exists(path):
        print(f"File not found: {path}")
        return None
    with gzip.open(path, "rb") as f:
        data = pickle.load(f)
    print(f"Data read from {path}")
    return data

In [16]:
similarity = readResults(PARA_DIR, f"sim-{A.version}.zip")
if not similarity:
    similarity = computeSim()
    writeResults(similarity, PARA_DIR, f"sim-{A.version}.zip")

File not found: /Users/werk/github/Nino-cunei/ninmed/_temp/parallels/sim-0.2.zip
4261740 comparisons to make
  0.08s   1% -        42617 comparisons and          6 similarities
  0.13s   2% -        85234 comparisons and         10 similarities
  0.19s   3% -       127851 comparisons and         10 similarities
  0.25s   4% -       170468 comparisons and         13 similarities
  0.30s   5% -       213085 comparisons and         15 similarities
  0.37s   6% -       255702 comparisons and         15 similarities
  0.42s   7% -       298319 comparisons and         18 similarities
  0.48s   8% -       340936 comparisons and         20 similarities
  0.55s   9% -       383553 comparisons and         21 similarities
  0.61s  10% -       426170 comparisons and         23 similarities
  0.68s  11% -       468787 comparisons and         23 similarities
  0.74s  12% -       511404 comparisons and         27 similarities
  0.80s  13% -       554021 comparisons and         30 similarities
  0.87s

In [17]:
len(similarity)

626

Let's find out which lines have the most correspondences.

In [18]:
parallels = {}

for (ln, m) in similarity:
    parallels.setdefault(ln, set()).add(m)
    parallels.setdefault(m, set()).add(ln)

print(f"{len(parallels)} out of {nLines} lines have at least one similar line")

492 out of 2920 lines have at least one similar line


In [19]:
rankedParallels = sorted(
    parallels.items(),
    key=lambda x: (-len(x[1]), x[0]),
)

In [20]:
for (ln, paras) in rankedParallels[0:10]:
    print(f"{len(paras):>4} siblings of {ln} = {T.text(ln)}")

  10 siblings of 87687 = [KA.INIM.MA] IGI GIG.GA.A.KAM₂ 
  10 siblings of 87698 = KA.INIM.MA [IGI GIG.GA.A.KAM₂] 
  10 siblings of 87706 = KA.INIM.MA IGI GIG.GA.[KAM₂] 
  10 siblings of 87766 = KA#.INIM#[.MA IGI GIG.GA].A#.KAM₂ 
  10 siblings of 87843 = [KA.INIM.MA] IGI GIG.GA.A.[KAM₂] 
  10 siblings of 87855 = KA.INIM.MA IGI GIG.GA.A.KAM₂ 
  10 siblings of 87863 = KA.[INIM.MA] [IGI GIG.GA.KAM₂] 
  10 siblings of 87871 = [KA.INIM.MA] IGI GIG.[GA.A.KAM₂] 
  10 siblings of 87882 = [KA.INIM.MA ] IGI# GIG.GA.[KAM₂] 
  10 siblings of 87908 = [KA.] INIM#.MA IGI GIG.GA.A.KAM₂ 


In [21]:
for (ln, paras) in rankedParallels[200:210]:
    print(f"{len(paras):>4} siblings of {T.text(ln)}")

   1 siblings of %sux [x x x x gi] pisan#-gen₇ keš₂-da 
   1 siblings of [x x GAZI] sar# SILA₁₁-aš LAL-ma UD.3.KAM₂# [NU DU₈] 
   1 siblings of [EGIR na]-aṣ-ma-da-te an-na-ti 10 GIN₂ ZA₃.HI.LI ša₂ KA# [x x] 
   1 siblings of [DIŠ NA ] SAG#.KI.DAB.BA TUKU.TUKU [x x x x] 
   1 siblings of [x x x x x x x x x x x x x ŠEŠ₂]-su#-ma# TI 
   1 siblings of [x x x x x x x] ŠEŠ₂-su-ma# [TI] 
   1 siblings of [x x x x x x x x x] tu#-bal ta-sak₃ ina I₃ HI.HI ŠEŠ₂-MEŠ#-su#-[ma TI] 
   1 siblings of [x x x x x x x ] tu#-bal ta-sak₃ ina I₃ HI.HI ŠEŠ₂-MEŠ-su#[-ma TI] 
   1 siblings of DIŠ NA ki-is ŠA₃# [GIG ...] 
   1 siblings of ul-tu x [...] 


In [22]:
for (ln, paras) in rankedParallels[480:490]:
    print(f"{len(paras):>4} siblings of {T.text(ln)}")

   1 siblings of [...] ina# KAŠ.SAG NAG 
   1 siblings of [GUR-ma ] HAD₂#.A GAZ ina A ZU₂.LUM ina dugGAN tara-bak ina TUG₂.HI.A SUR-ri ur-ri u GE₆ LAL 
   1 siblings of ina A GAZIsar tara-bak GUR-ma HAD₂.A GAZ ina A ZU₂.LUM.MA ina dugGAN tara-bak ina TUG₂.HI.A SUR-ri ur-ri u GE₆ [] LAL# 
   1 siblings of DIŠ NA SAG ŠA₃-šu₂ GU₇-šu₂ ina ge-ši-šu₂ ZE₂ im-ta-na-ʾ NA BI qer-be₂-na GIG 
   1 siblings of DUB 2.KAM₂ DIŠ NA su-a-lam GIG ana ki-is ŠA₃ GUR 
   1 siblings of %sux sag sahar₂-ra mu-un#-[dab x x x x x x x] 
   1 siblings of %sux šu du₃.du₃.meš šu ne.ne#[.a.meš] eridu#ki#.ga# mu#.un#.[tum₂ TU₆.EN₂] 
   1 siblings of %sux [] EN₂# sag-ki-ni sag-ki [x x x x] 
   1 siblings of %sux [] mu#-ru-ub-bi-ni mu-ru-ub#[-bi x x x x] 
   1 siblings of %sux [] EN₂# sag-ki#[-ni x x x x x x x x x x x x x x x] 


And how many lines have just one correspondence?

We look at the tail of rankedParallels.

In [23]:
pairs = [(x, list(paras)[0]) for (x, paras) in rankedParallels if len(paras) == 1]
print(f"There are {len(pairs)} exclusively parallel pairs of lines")

There are 293 exclusively parallel pairs of lines


Why not make an overview of exactly how wide-spread parallel lines are?

We count how many lines have how many parallels.

In [24]:
parallelCount = collections.Counter()

buckets = (2, 10, 20, 50, 100)

bucketRep = {}
prevBucket = None
for bucket in buckets:
    if prevBucket is None:
        bucketRep[bucket] = f"       n <= {bucket:>3}"
    elif bucket == buckets[-1]:
        bucketRep[bucket] = f"       n >  {bucket:>3}"
    else:
        bucketRep[bucket] = f"{prevBucket:>3} <  n <= {bucket:>3}"
    prevBucket = bucket

for (ln, paras) in rankedParallels:
    clusterSize = len(paras) + 1
    if clusterSize > buckets[-1]:
        theBucket = buckets[-1]
    else:
        for bucket in buckets:
            if clusterSize <= bucket:
                theBucket = bucket
                break
    parallelCount[theBucket] += 1

for (bucket, amount) in sorted(
    parallelCount.items(),
    key=lambda x: (-x[0], x[1]),
):
    print(f"{amount:>4} lines have n sisters where {bucketRep[bucket]}")

  11 lines have n sisters where  10 <  n <=  20
 188 lines have n sisters where   2 <  n <=  10
 293 lines have n sisters where        n <=   2


# Add parallels to the TF dataset

We can add this information to the Oldbabylonian dataset as an *edge feature*.

An edge feature links two nodes and may annotate that link with a value.

For parallels, we link each line to each of its parallel lines and we annotate that link with the similarity between
the two lines. The similarity is a percentage, and we round it to integer values.

If *n1* is similar to *n2*, then *n2* is similar to *n1*.
In order to save space, we only add such links once.

We can then use
[`E.sim.b(node)`](https://annotation.github.io/text-fabric/Api/Features/#edge-features)
to find all nodes that are parallel to node.


In [25]:
metaData = {
    "": {
        "name": "Nineveh Medical Encyclopedia Cuneiform",
        "editor": "Cale Johnson et. al.",
        "institute": "Institut für Wissensgeschichte des Altertums",
        "converters": "Cale Johnson, Dirk Roorda",
    },
    "sim": {
        "valueType": "int",
        "edgeValues": True,
        "description": "similarity between lines, as a percentage of the common material wrt the combined material",
    },
}

In [26]:
simData = {}
for ((f, t), d) in similarity.items():
    simData.setdefault(f, {})[t] = d

In [27]:
ghBase = os.path.expanduser("~/github")
subdir = "parallels"
path = f"{A.context.org}/{A.context.repo}/{subdir}/tf"
location = f"{ghBase}/{path}"
module = A.version

In [28]:
TF.save(
    edgeFeatures=dict(sim=simData), metaData=metaData, location=location, module=module
)

  0.00s Exporting 0 node and 1 edge and 0 config features to ~/github/Nino-cunei/ninmed/parallels/tf/0.2:
   |     0.00s T sim                  to ~/github/Nino-cunei/ninmed/parallels/tf/0.2
  0.00s Exported 0 node features and 1 edge features and 0 config features to ~/github/Nino-cunei/ninmed/parallels/tf/0.2


True

# Turn the parallels feature into a module

The new `sim` feature is a big data feature. You do not want to load it all the time.

Here we show how to turn it into a module, so that users can easily load it in a Jupyter notebook or in the TF browser.

In [29]:
%%bash
text-fabric-zip 'Nino-cunei/oldbabylonian/parallels/tf'

This is a TF dataset
Create release data for Nino-cunei/oldbabylonian/parallels/tf
Found 4 versions
zip files end up in ~/Downloads/Nino-cunei-release/oldbabylonian
zipping Nino-cunei/oldbabylonian  1.0.1 with   1 features ==> parallels-tf-1.0.1.zip
zipping Nino-cunei/oldbabylonian  1.0.4 with   1 features ==> parallels-tf-1.0.4.zip
zipping Nino-cunei/oldbabylonian  1.0.5 with   1 features ==> parallels-tf-1.0.5.zip
zipping Nino-cunei/oldbabylonian  1.0.6 with   1 features ==> parallels-tf-1.0.6.zip


I have added this file to a new release of the Oldbabylonian Github repo.

# Use the parallels module

See tutorial similarLines.