In [2]:
%pylab inline
## -*- coding: utf-8 -*-

Populating the interactive namespace from numpy and matplotlib


In [341]:
import re
import json
import time
import copy
import xml.etree.ElementTree as ET

In [4]:
tropnames = {'etnakhta': u'\u0591', 'segol': u'\u0592', 'shalshelet': u'\u0593', 'katan': u'\u0594', 'gadol': u'\u0595',
             'tipkha': u'\u0596', 'revii': u'\u0597', 'tsinnorit': u'\u0598', 'pashta': u'\u0599', 'yetiv': u'\u059a', 'tevir': u'\u059b',
             'geresh': u'\u059c', 'gereshmukdam': u'\u059d', 'gershayim': u'\u059e', 'karnepara': u'\u059f', 'telishagedola': u'\u05a0',
             'pazer': u'\u05a1', 'munakh': u'\u05a3', 'mapakh': u'\u05a4', 'merkha': u'\u05a5',
             'merkhakfula': u'\u05a6', 'darga': u'\u05a7', 'kadma': u'\u05a8', 'telishaketana': u'\u05a9', 'yerakhbenyomo': u'\u05aa',
             'sofpasuk': u'\u05c3', 'zarka': u'\u05ae'}

# per wikipedia: Note that both marks have been wrongly named by Unicode.[5][6] Zarqa/tsinnor corresponds to Unicode
# "Hebrew accent zinor", code point U+05AE (where "zinor" is a misspelled form for tsinnor), while tsinnorit maps to
# "Hebrew accent zarqa", code point U+0598. 

tropchars = {tropnames[t]: t for t in tropnames}

In [5]:
sfarim = ['bereshit', 'shmot', 'vayikra', 'bmidbar', 'dvarim']

tropstrings = {}
for sefer in sfarim:
    tropstrings[sefer] = {}
    tree = ET.parse(sefer + '.xml')
    root = tree.getroot() 
    prakim = root.findall('.//c')
    for perek in prakim:
        pereknum = int(perek.attrib['n'])
        if pereknum not in tropstrings[sefer]: tropstrings[sefer][pereknum] = {}
        psukim = perek.findall('v')
        for pasuk in psukim:
            pasuknum = int(pasuk.attrib['n'])
            if pasuknum not in tropstrings[sefer][pereknum]:
                # row = {name: 0 for name in tropnames.keys()}
                # row['sefer'] = sefer
                # row['pasuk'] = pasuknum
                # row['perek'] = pereknum
                tropstrings[sefer][pereknum][pasuknum] = ''
            words = pasuk.findall('w')
            wordslist = map(lambda w: list(w.text), words)
            troplist = map(lambda w: filter(lambda c: c in tropchars, w), wordslist)
            troplist = filter(lambda w: len(w) > 0, troplist)
            # there's probably a better way to do this, but I can't think of it
            flattroplist = []
            for w in troplist:
                for c in w:
                    flattroplist.append(c)
            tropstrings[sefer][pereknum][pasuknum] += ''.join(flattroplist)

In [385]:
def seqsearch(seq, s):
#     print len(sources)
    seqcount = 0
    if s != []:
        newsources = []
        for source in s:
#             print sources
            found = re.findall(seq, tropstrings[source[0]][source[1]][source[2]])
            seqcount += len(found)
            if len(found) > 0: newsources.append(source)
        if len(newsources) > 0: s = newsources
    else:
        for sefer in tropstrings:
            for perek in tropstrings[sefer]:
                for pasuk in tropstrings[sefer][perek]:
                    found = re.findall(seq, tropstrings[sefer][perek][pasuk])
                    seqcount += len(found)
                    if len(found) > 0: s.append((sefer, perek, pasuk))
    return (seqcount, s)

In [393]:
map(lambda t: tropchars[t], tropstrings['dvarim'][16][18])

['munakh',
 'revii',
 'pashta',
 'katan',
 'kadma',
 'darga',
 'tevir',
 'merkha',
 'tipkha',
 'etnakhta',
 'merkha',
 'tipkha',
 'sofpasuk']

In [394]:
seqsearch(tropnames['munakh']+tropnames['revii']+tropnames['pashta']+tropnames['katan'], [])

(92,
 [('vayikra', 2, 2),
  ('vayikra', 4, 21),
  ('vayikra', 7, 37),
  ('vayikra', 11, 10),
  ('vayikra', 14, 45),
  ('vayikra', 15, 25),
  ('vayikra', 18, 26),
  ('vayikra', 19, 25),
  ('vayikra', 24, 14),
  ('vayikra', 24, 23),
  ('vayikra', 25, 16),
  ('vayikra', 26, 41),
  ('vayikra', 26, 43),
  ('bmidbar', 8, 21),
  ('bmidbar', 11, 5),
  ('bmidbar', 11, 24),
  ('bmidbar', 11, 32),
  ('bmidbar', 14, 6),
  ('bmidbar', 15, 11),
  ('bmidbar', 15, 30),
  ('bmidbar', 18, 7),
  ('bmidbar', 20, 10),
  ('bmidbar', 20, 21),
  ('bmidbar', 21, 3),
  ('bmidbar', 22, 25),
  ('bmidbar', 26, 59),
  ('bmidbar', 32, 9),
  ('shmot', 4, 20),
  ('shmot', 6, 23),
  ('shmot', 10, 26),
  ('shmot', 14, 10),
  ('shmot', 20, 11),
  ('shmot', 21, 4),
  ('shmot', 21, 32),
  ('shmot', 22, 24),
  ('shmot', 23, 12),
  ('shmot', 24, 12),
  ('shmot', 25, 12),
  ('shmot', 25, 32),
  ('shmot', 26, 8),
  ('shmot', 28, 30),
  ('shmot', 29, 37),
  ('shmot', 30, 13),
  ('shmot', 31, 7),
  ('shmot', 32, 13),
  ('shmot',

In [379]:
def getcounts(seq, sources):
    parent = {}
    for trop in tropnames:
        thisseq = seq+tropnames[trop]
        seqlist = map(lambda c: tropchars[c], list(thisseq)) # for printing
#         print "Searching for " + str(seqlist) + " in " + str(len(sources)) + " psukim"
        seqcount, localsources = seqsearch(thisseq, copy.copy(sources))
#         print "Found " + str(seqcount) + " in " + str(len(sources)) + " psukim"
        if seqcount > 0:
            parent[trop] = {'count': seqcount}
            parent[trop]['children'] = getcounts(thisseq, localsources)
#         sources = [] # if you finish all of the trop, go back to looking in all. stupid way to make this happen.
    return parent

In [386]:
start = time.time()
tree = getcounts('', [])
print time.time() - start

236.830826044


In [395]:
tree['munakh']['children']['revii']['children']['pashta']['children']['katan']['count']

92

In [333]:
sum([len(tropstrings[s]) for s in tropstrings])

192

In [387]:
with open('sequencetree.json', 'wb') as outfile:
    json.dump(tree, outfile)

It's not a true single tree because each root trop is a seperate tree; there's no single parent. So it seems legit to iterate through the tropnames to start.

The terminating condition is when count on a branch goes to 0. This will always happen once something is added after a *sof pasuk*.

I think in order to keep track of what my sequence is, it makes the most sense to do this depth first.

In [116]:
def getcounts(seq='', parent={}):
#     print parent
    seqlist = map(lambda c: tropchars[c], list(seq)) # for printing
    for trop in tropnames:
        seqcount = 0
#         print map(lambda c: tropchars[c], list(seq))
        seqlist.append(trop)
        print seqlist
        for sefer in tropstrings:
            for perek in tropstrings[sefer]:
                for pasuk in tropstrings[sefer][perek]:
                    found = re.findall('^'+seq+tropnames[trop], tropstrings[sefer][perek][pasuk])
                    seqcount += len(found)
        print seqcount
#         if seqcount == 0:
# #             print parent
#             return
        parent[trop] = {'count': seqcount}
        if seqcount > 0:
            parent[trop]['children'] = getcounts(seq+tropnames[trop])
#             return parent
        else:
            return parent[trop]
    return parent

In [117]:
tree = getcounts()

['revii']
323
['revii', 'revii']
0
['revii', 'gereshmukdam']
0


In [118]:
tree

{'count': 0}

In [44]:
counttree = {}
def getcounts():
    for sefer in tropstrings:
        for perek in tropstrings[sefer]:
            for pasuk in tropstrings[sefer][perek]:
                for i,c in enumerate(tropstrings[sefer][perek][pasuk]):
                    if tropchars[c] in counttree:
                        counttree[tropchars[c]]['count'] += 1

In [45]:
getcounts()

0 ֖
1 ֑
2 ֤
3 ֙
4 ֔
5 ֥
6 ֖
7 ׃
0 ֞
1 ֤
2 ֙
3 ֣
4 ֔
5 ֗
6 ֥
7 ֛
8 ֖
9 ֑
10 ֗
11 ֙
12 ֔
13 ֖
14 ׃
0 ֤
1 ֙
2 ֔
3 ֥
4 ֖
5 ֑
6 ֝
7 ֤
8 ֙
9 ֣
10 ֔
11 ֖
12 ֥
13 ׃
0 ֣
1 ֔
2 ֖
3 ֣
4 ֑
5 ֥
6 ֖
7 ֥
8 ׃
0 ֛
1 ֥
2 ֖
3 ֣
4 ֑
5 ֠
6 ֨
7 ֤
8 ֙
9 ֔
10 ֨
11 ֤
12 ֙
13 ֙
14 ֔
15 ֖
16 ֥
17 ׃
0 ֖
1 ֑
2 ֥
3 ֖
4 ׃
0 ֠
1 ֨
2 ֧
3 ֛
4 ֖
5 ֑
6 ֥
7 ֖
8 ׃
0 ֗
1 ֤
2 ֙
3 ֣
4 ֔
5 ֚
6 ֔
7 ֖
8 ֑
9 ֙
10 ֣
11 ֔
12 ֖
13 ׃
0 ֥
1 ֖
2 ֣
3 ֑
4 ֨
5 ֤
6 ֙
7 ֔
8 ֛
9 ֥
10 ֖
11 ׃
0 ֨
1 ֧
2 ֛
3 ֥
4 ֖
5 ֑
6 ֥
7 ֖
8 ׃
0 ֨
1 ֜
2 ֣
3 ֧
4 ֛
5 ֖
6 ֣
7 ֑
8 ֡
9 ֩
10 ֨
11 ֧
12 ֛
13 ֖
14 ׃
0 ֤
1 ֙
2 ֔
3 ֖
4 ֑
5 ֤
6 ֙
7 ֔
8 ֙
9 ֣
10 ֔
11 ֖
12 ׃
0 ֥
1 ֖
2 ֣
3 ֑
4 ֨
5 ֤
6 ֙
7 ֣
8 ֔
9 ֣
10 ֗
11 ֛
12 ֥
13 ֖
14 ׃
0 ֧
1 ֛
2 ֥
3 ֖
4 ֑
5 ֣
6 ֗
7 ֛
8 ֥
9 ֖
10 ׃
0 ֤
1 ֙
2 ֔
3 ֙
4 ֔
5 ֖
6 ֑
7 ֣
8 ֔
9 ֖
10 ֥
11 ׃
0 ֥
1 ֖
2 ֑
3 ֨
4 ֜
5 ֤
6 ֙
7 ֙
8 ֔
9 ֖
10 ׃
0 ֨
1 ֣
2 ֮
3 ֣
4 ֒
5 ֨
6 ֤
7 ֙
8 ֔
9 ֖
10 ֣
11 ֑
12 ֣
13 ֗
14 ֛
15 ֥
16 ֖
17 ׃
0 ֗
1 ֞
2 ֤
3 ֙
4 ֔
5 ֖
6 ֣
7 ֑
8 ֤
9 ֙
10 ֙
11 ֔
12 ֥
13 ֖
14 ׃
0 ֗
1 ֣
2 ֮
3 ֒
4 ֨
5 ֜
6 ֣
7 ֗
8 ֙
9 ֔
10 ֖
11 ֑
12 ֨
