In [1]:
%pylab inline
## -*- coding: utf-8 -*-

Populating the interactive namespace from numpy and matplotlib


In [2]:
import re
import json
import time
import copy
import xml.etree.ElementTree as ET

In [3]:
# Transliterated trop (cantillation mark) name -> the Unicode combining
# character used for it in the source text.
tropnames = {
    'etnakhta': u'\u0591',
    'segol': u'\u0592',
    'shalshelet': u'\u0593',
    'katan': u'\u0594',
    'gadol': u'\u0595',
    'tipkha': u'\u0596',
    'revii': u'\u0597',
    'tsinnorit': u'\u0598',
    'pashta': u'\u0599',
    'yetiv': u'\u059a',
    'tevir': u'\u059b',
    'geresh': u'\u059c',
    'gereshmukdam': u'\u059d',
    'gershayim': u'\u059e',
    'karnepara': u'\u059f',
    'telishagedola': u'\u05a0',
    'pazer': u'\u05a1',
    'munakh': u'\u05a3',
    'mapakh': u'\u05a4',
    'merkha': u'\u05a5',
    'merkhakfula': u'\u05a6',
    'darga': u'\u05a7',
    'kadma': u'\u05a8',
    'telishaketana': u'\u05a9',
    'yerakhbenyomo': u'\u05aa',
    'sofpasuk': u'\u05c3',
    'zarka': u'\u05ae',
}

# Sof pasuk is U+05C3; siluk is U+05BD and is not part of this table.
#
# Per Wikipedia, Unicode named two of these marks the wrong way round:
# zarqa/tsinnor is Unicode "Hebrew accent zinor", code point U+05AE ("zinor"
# being a misspelling of tsinnor), while tsinnorit is Unicode "Hebrew accent
# zarqa", code point U+0598. The keys above follow the traditional names,
# not the Unicode ones.

# Reverse lookup: character -> transliterated name.
tropchars = dict((char, name) for name, char in tropnames.items())

## Parse

In [54]:
sfarim = ['bereshit', 'shmot', 'vayikra', 'bmidbar', 'dvarim']


def _pasuk_trop(pasuk):
    """Return the trop marks found in one verse element, as a single string.

    Only the direct children of ``pasuk`` tagged ``w`` or ``q`` are read
    (NOTE(review): ``q`` presumably marks ketiv words - confirm against the
    XML schema). Within each word, every character that is a key of
    ``tropchars`` is kept, in order.

    Normalisations applied:
      * a word carrying exactly two identical marks counts the mark once
        (stress doubling);
      * tsinnorit is recorded as zarka, and gereshmukdam as geresh.
    """
    marks = []
    for word in pasuk:
        if word.tag not in ('w', 'q'):
            continue
        # Element.text is None when the element has no leading text; treat
        # that as an empty word instead of letting list(None) raise.
        wordmarks = [c for c in (word.text or '') if c in tropchars]
        # handle stress doublings
        if len(wordmarks) == 2 and wordmarks[0] == wordmarks[1]:
            del wordmarks[1]
        for c in wordmarks:
            if c == tropnames['tsinnorit']:
                c = tropnames['zarka']
            if c == tropnames['gereshmukdam']:
                c = tropnames['geresh']
            marks.append(c)
    return ''.join(marks)


# tropstrings[sefer][pereknum][pasuknum] -> string of trop characters.
# Each sefer is read from '<sefer>.xml' in the working directory; chapters are
# the <c n="..."> elements anywhere in the file and verses are their direct
# <v n="..."> children.
#
# Lists are built with comprehensions/loops rather than map()/filter() so the
# cell behaves the same on Python 2 and Python 3 (where map/filter return
# one-shot iterators that have no len()).
tropstrings = {}

for sefer in sfarim:
    tropstrings[sefer] = {}
    root = ET.parse(sefer + '.xml').getroot()
    for perek in root.findall('.//c'):
        pereknum = int(perek.attrib['n'])
        perektrop = tropstrings[sefer].setdefault(pereknum, {})
        for pasuk in perek.findall('v'):
            pasuknum = int(pasuk.attrib['n'])
            # If the same chapter/verse number occurs more than once, the
            # marks are appended to what is already stored.
            perektrop[pasuknum] = perektrop.get(pasuknum, '') + _pasuk_trop(pasuk)

## Build the tree (and the seqsearch helper)

### Output the json

## Sources

**TODO:** I really wish I understood why `byperek.index.levels[0]`, `sequencecountsbypasuk.keys()`, and the `'seq'` element of `flatcounts` are all different. Suspicious.

## Make full index for sparse javascript array

In [743]:
# Build a JavaScript array literal with one '"sefer,perek"' entry per row of
# pereklengths, taking the two parts from the first two elements of each row's
# index label.
# NOTE(review): pereklengths is not defined in the cells shown here -
# presumably a pandas object indexed by (sefer, perek); confirm.
quoted_labels = []
for entry in pereklengths.iterrows():
    label = entry[0]
    quoted_labels.append('"' + label[0] + ',' + str(label[1]) + '"')
indexstring = '[' + ','.join(quoted_labels) + ']'

In [744]:
# Show the generated index string as this cell's output.
indexstring

'["bereshit,1","bereshit,2","bereshit,3","bereshit,4","bereshit,5","bereshit,6","bereshit,7","bereshit,8","bereshit,9","bereshit,10","bereshit,11","bereshit,12","bereshit,13","bereshit,14","bereshit,15","bereshit,16","bereshit,17","bereshit,18","bereshit,19","bereshit,20","bereshit,21","bereshit,22","bereshit,23","bereshit,24","bereshit,25","bereshit,26","bereshit,27","bereshit,28","bereshit,29","bereshit,30","bereshit,31","bereshit,32","bereshit,33","bereshit,34","bereshit,35","bereshit,36","bereshit,37","bereshit,38","bereshit,39","bereshit,40","bereshit,41","bereshit,42","bereshit,43","bereshit,44","bereshit,45","bereshit,46","bereshit,47","bereshit,48","bereshit,49","bereshit,50","shmot,1","shmot,2","shmot,3","shmot,4","shmot,5","shmot,6","shmot,7","shmot,8","shmot,9","shmot,10","shmot,11","shmot,12","shmot,13","shmot,14","shmot,15","shmot,16","shmot,17","shmot,18","shmot,19","shmot,20","shmot,21","shmot,22","shmot,23","shmot,24","shmot,25","shmot,26","shmot,27","shmot,28","shmot,2

## Aseret hadibrot and other data cleaning

In [55]:
# bmidbar 25:19: the parsed text has it as its own verse, while sefaria has it
# at the beginning of bmidbar 26:1. Prepend its trop to 26:1 and remove 25:19.
# NOTE: not re-runnable - a second run fails because 25:19 is already gone.
perek25, perek26 = tropstrings['bmidbar'][25], tropstrings['bmidbar'][26]
perek26[1] = ''.join((perek25[19], perek26[1]))
del perek25[19]

In [56]:
# missing sof pasuks: make every verse's trop string end with a sof pasuk.
sofpasuk = tropnames['sofpasuk']
for sefer in tropstrings:
    for perek in tropstrings[sefer]:
        psukimtrop = tropstrings[sefer][perek]
        for pasuk in psukimtrop:
            # endswith() instead of [-1]: a verse in which no trop mark was
            # found is stored as an empty string, and indexing it would raise
            # IndexError. Such a verse simply becomes a lone sof pasuk.
            if not psukimtrop[pasuk].endswith(sofpasuk):
                psukimtrop[pasuk] += sofpasuk

In [57]:
# Rebuild shmot 20 with hand-entered trop for the verses listed below.
# Verses 1, 11 and 12 keep the parsed trop; parsed verses 18 and up are
# renumbered down by 3 (18 -> 15, ...); parsed verses 13-17 are not carried
# over.
# NOTE: not re-runnable - the cell overwrites tropstrings['shmot'][20], so a
# second run would renumber the already-renumbered verses.
shmot20 = tropstrings['shmot'][20]

# new verse number -> sequence of trop names, in reading order
shmot20_marks = {
    2: ('pashta', 'munakh', 'katan', 'munakh', 'telishaketana', 'kadma',
        'geresh', 'munakh', 'revii'),
    3: ('munakh', 'munakh', 'telishaketana', 'kadma', 'geresh', 'revii'),
    4: ('munakh', 'munakh', 'munakh', 'pazer', 'munakh', 'munakh', 'pazer',
        'telishaketana', 'kadma', 'geresh', 'merkha', 'munakh', 'munakh',
        'revii'),
    5: ('munakh', 'zarka', 'munakh', 'segol', 'munakh', 'gershayim',
        'mapakh', 'pashta', 'munakh', 'katan', 'telishagedola', 'kadma',
        'darga', 'tevir', 'merkha', 'tipkha', 'etnakhta'),
    6: ('mapakh', 'pashta', 'katan', 'tipkha', 'merkha', 'sofpasuk'),
    7: ('merkha', 'tevir', 'merkha', 'tipkha', 'etnakhta', 'munakh',
        'mapakh', 'pashta', 'katan', 'tevir', 'merkha', 'tipkha',
        'sofpasuk'),
    8: ('telishaketana', 'kadma', 'geresh', 'revii'),
    9: ('munakh', 'munakh', 'zarka', 'munakh', 'segol'),
    10: ('kadma', 'geresh', 'munakh', 'munakh', 'revii', 'munakh', 'munakh',
         'pazer', 'munakh', 'telishagedola', 'kadma', 'geresh', 'revii',
         'pashta', 'munakh', 'katan'),
    13: ('tipkha', 'sofpasuk', 'tipkha', 'sofpasuk', 'merkha', 'tipkha',
         'merkha', 'sofpasuk'),
    14: ('gershayim', 'munakh', 'revii', 'mapakh', 'pashta', 'munakh',
         'katan', 'tipkha', 'merkha', 'sofpasuk'),
}

# A hand-entered alternative for verse 11 that is not used (the parsed trop is
# kept instead):
# newshmot20[11] = tropnames['munakh'] + tropnames['telishaketana'] + tropnames['kadma'] + tropnames['geresh'] + tropnames['munakh'] + tropnames['revii'] + tropnames['pashta'] + tropnames['katan'] + tropnames['tipkha'] + tropnames['munakh'] + tropnames['etnakhta']

newshmot20 = {}
for num in range(1, 15):
    if num in shmot20_marks:
        newshmot20[num] = ''.join(tropnames[name] for name in shmot20_marks[num])
    else:
        # verses 1, 11 and 12: keep what was parsed
        newshmot20[num] = shmot20[num]

# shift the remainder of the chapter down by three verse numbers
newshmot20.update((num - 3, trop) for num, trop in shmot20.items() if num >= 18)

tropstrings['shmot'][20] = newshmot20

In [58]:
# Rebuild dvarim 5 with hand-entered trop for the verses listed below.
# Parsed verses up to 5, and verses 11 and 16, keep the parsed trop; parsed
# verses 21 and up are renumbered down by 3 (21 -> 18, ...); the remaining
# parsed verses are not carried over.
# NOTE: not re-runnable - the cell overwrites tropstrings['dvarim'][5], so a
# second run would renumber the already-renumbered verses.
dvarim5 = tropstrings['dvarim'][5]

# new verse number -> sequence of trop names, in reading order
dvarim5_marks = {
    6: ('pashta', 'munakh', 'katan', 'munakh', 'telishaketana', 'kadma',
        'geresh', 'munakh', 'revii'),
    7: ('munakh', 'telishaketana', 'kadma', 'geresh', 'revii'),
    8: ('munakh', 'munakh', 'munakh', 'pazer', 'munakh', 'munakh', 'pazer',
        'telishaketana', 'kadma', 'geresh', 'merkha', 'munakh', 'munakh',
        'revii'),
    9: ('munakh', 'zarka', 'munakh', 'segol', 'munakh', 'gershayim',
        'mapakh', 'pashta', 'munakh', 'katan', 'telishagedola', 'kadma',
        'darga', 'tevir', 'merkha', 'tipkha', 'etnakhta'),
    10: ('mapakh', 'pashta', 'katan', 'tipkha', 'merkha', 'sofpasuk'),
    12: ('munakh', 'telishaketana', 'kadma', 'geresh', 'merkha', 'munakh',
         'munakh', 'revii'),
    13: ('munakh', 'munakh', 'zarka', 'munakh', 'segol'),
    14: ('kadma', 'geresh', 'munakh', 'munakh', 'revii', 'munakh', 'munakh',
         'pazer', 'munakh', 'munakh', 'munakh', 'telishagedola', 'kadma',
         'geresh', 'revii', 'pashta', 'munakh', 'katan', 'revii', 'tevir',
         'merkha', 'tipkha', 'etnakhta'),
    15: ('gershayim', 'merkha', 'munakh', 'munakh', 'revii',
         'telishaketana', 'kadma', 'mapakh', 'pashta', 'mapakh', 'pashta',
         'munakh', 'katan', 'revii', 'pashta', 'munakh', 'katan', 'tipkha',
         'merkha', 'sofpasuk'),
    17: ('tipkha', 'sofpasuk', 'tipkha', 'sofpasuk', 'tipkha', 'sofpasuk',
         'merkha', 'tipkha', 'merkha', 'sofpasuk'),
}

# A hand-entered alternative for verse 5 that is not used (the parsed trop is
# kept instead). It spells the key 'katon', which does not exist in tropnames
# ('katan' does), so it would need correcting before being re-enabled:
# newdvarim5[5] = tropnames['telishagedola'] + tropnames['kadma'] + tropnames['mapakh'] + tropnames['pashta'] + tropnames['munakh'] + tropnames['katon'] + tropnames['merkha'] + tropnames['tipkha'] + tropnames['munakh'] + tropnames['etnakhta'] + tropnames['mapakh'] + tropnames['pashta'] + tropnames['munakh'] + tropnames['katon'] + tropnames['merkha'] + tropnames['tipkha'] + tropnames['sofpasuk']

# opening verses are taken over unchanged
newdvarim5 = {num: trop for num, trop in dvarim5.items() if num <= 5}

for num in range(6, 18):
    if num in dvarim5_marks:
        newdvarim5[num] = ''.join(tropnames[name] for name in dvarim5_marks[num])
    else:
        # verses 11 and 16: keep what was parsed
        newdvarim5[num] = dvarim5[num]

# shift the remainder of the chapter down by three verse numbers
newdvarim5.update((num - 3, trop) for num, trop in dvarim5.items() if num >= 21)

tropstrings['dvarim'][5] = newdvarim5

In [59]:
# bereshit 35:22: replace the parsed trop with a hand-entered sequence.
bereshit_35_22_marks = (
    'revii', 'mapakh', 'pashta', 'munakh', 'katan', 'munakh', 'revii',
    'pashta', 'pashta', 'pashta', 'munakh', 'katan', 'tipkha', 'etnakhta',
    'merkha', 'tipkha', 'merkha', 'sofpasuk',
)
tropstrings['bereshit'][35][22] = ''.join(tropnames[name] for name in bereshit_35_22_marks)

In [60]:
# dvarim 13:15: overwrite the second trop mark of the verse (index 1) with a
# tevir.
# NOTE(review): the reason for the correction is not recorded here -
# presumably an error in the source text; confirm.
dvarim13 = tropstrings['dvarim'][13]
newdvarim1315 = [mark for mark in dvarim13[15]]
newdvarim1315[1] = tropnames['tevir']
dvarim13[15] = ''.join(newdvarim1315)

In [61]:
# shmot 4:10: insert a tipkha so that it becomes the mark at index 15 of the
# verse's trop string.
# NOTE: not re-runnable - every run inserts another tipkha.
# NOTE(review): the reason for the correction is not recorded here -
# presumably a mark missing from the source text; confirm.
shmot4 = tropstrings['shmot'][4]
newshmot410 = [mark for mark in shmot4[10]]
newshmot410.insert(15, tropnames['tipkha'])
shmot4[10] = ''.join(newshmot410)

## Never mind all that stuff up there. Just make an array of strings

In [62]:
# Flatten the nested sefer -> perek -> pasuk mapping into a list with one
# record per verse, in the iteration order of the nested dicts.
flattropstrings = []
for sefer, perekmap in tropstrings.items():
    for perek, pasukmap in perekmap.items():
        for pasuk, trop in pasukmap.items():
            flattropstrings.append({'sefer': sefer, 'perek': perek, 'pasuk': pasuk, 'trop': trop})

In [63]:
# Write the per-verse records to tropstrings.json in the working directory.
# The file is opened in text mode: json.dump() writes str objects, so a
# binary-mode ('wb') file raises TypeError on Python 3. With the default
# ensure_ascii=True the output is plain ASCII, so text mode is also fine on
# Python 2.
with open('tropstrings.json', 'w') as outfile:
    json.dump(flattropstrings, outfile)

In [30]:
# Show the name/character table as a JSON array of {"name", "char"} objects.
json.dumps([{"name": name, "char": char} for name, char in tropnames.items()])

'[{"char": "\\u0597", "name": "revii"}, {"char": "\\u059d", "name": "gereshmukdam"}, {"char": "\\u05a6", "name": "merkhakfula"}, {"char": "\\u059e", "name": "gershayim"}, {"char": "\\u059b", "name": "tevir"}, {"char": "\\u059f", "name": "karnepara"}, {"char": "\\u0595", "name": "gadol"}, {"char": "\\u05a0", "name": "telishagedola"}, {"char": "\\u0599", "name": "pashta"}, {"char": "\\u0593", "name": "shalshelet"}, {"char": "\\u0596", "name": "tipkha"}, {"char": "\\u059a", "name": "yetiv"}, {"char": "\\u0592", "name": "segol"}, {"char": "\\u05aa", "name": "yerakhbenyomo"}, {"char": "\\u05ae", "name": "zarka"}, {"char": "\\u05a3", "name": "munakh"}, {"char": "\\u05a5", "name": "merkha"}, {"char": "\\u05a8", "name": "kadma"}, {"char": "\\u0591", "name": "etnakhta"}, {"char": "\\u05c3", "name": "sofpasuk"}, {"char": "\\u0598", "name": "tsinnorit"}, {"char": "\\u059c", "name": "geresh"}, {"char": "\\u05a9", "name": "telishaketana"}, {"char": "\\u05a7", "name": "darga"}, {"char": "\\u05a1", "

In [20]:
# Scratch check: two looked-up marks concatenate into one string.
''.join([tropnames['tipkha'], tropnames['merkha']])

u'\u0596\u05a5'

In [27]:
# Scratch check: reverse lookup from characters back to names.
''.join(tropchars[char] for char in (u'\u05a3', u'\u0597'))

'munakhrevii'