In [2]:
%pylab inline
## -*- coding: utf-8 -*-

Populating the interactive namespace from numpy and matplotlib


In [341]:
import re
import json
import time
import copy
import xml.etree.ElementTree as ET

In [4]:
# Transliterated trop (cantillation mark) name -> the Unicode character used for it.
# Entry order is kept as-is: other cells iterate over this dict.
tropnames = {
    'etnakhta': u'\u0591',
    'segol': u'\u0592',
    'shalshelet': u'\u0593',
    'katan': u'\u0594',
    'gadol': u'\u0595',
    'tipkha': u'\u0596',
    'revii': u'\u0597',
    'tsinnorit': u'\u0598',
    'pashta': u'\u0599',
    'yetiv': u'\u059a',
    'tevir': u'\u059b',
    'geresh': u'\u059c',
    'gereshmukdam': u'\u059d',
    'gershayim': u'\u059e',
    'karnepara': u'\u059f',
    'telishagedola': u'\u05a0',
    'pazer': u'\u05a1',
    'munakh': u'\u05a3',
    'mapakh': u'\u05a4',
    'merkha': u'\u05a5',
    'merkhakfula': u'\u05a6',
    'darga': u'\u05a7',
    'kadma': u'\u05a8',
    'telishaketana': u'\u05a9',
    'yerakhbenyomo': u'\u05aa',
    'sofpasuk': u'\u05c3',
    'zarka': u'\u05ae',
}

# Per Wikipedia, Unicode names these two marks the wrong way round: zarka/tsinnor is
# U+05AE ("HEBREW ACCENT ZINOR", a misspelling of tsinnor), while tsinnorit is
# U+0598 ("HEBREW ACCENT ZARQA"). The mapping above follows the real names.

# Reverse lookup: trop character -> transliterated name.
tropchars = {char: name for name, char in tropnames.items()}

## Parse

In [427]:
sfarim = ['bereshit', 'shmot', 'vayikra', 'bmidbar', 'dvarim']

# tropstrings[sefer][pereknum][pasuknum] -> string made up of only the trop characters of
# that pasuk, in document order. Each sefer is read from '<sefer>.xml' in the working directory.
tropstrings = {}
for sefer in sfarim:
    tropstrings[sefer] = {}
    root = ET.parse(sefer + '.xml').getroot()
    for perek in root.findall('.//c'):
        pereknum = int(perek.attrib['n'])
        perektrop = tropstrings[sefer].setdefault(pereknum, {})
        for pasuk in perek.findall('v'):
            pasuknum = int(pasuk.attrib['n'])
            # Only direct <w> and <q> children of the pasuk contribute, in document order.
            words = [w for w in pasuk if w.tag in ('w', 'q')]
            # itertext() is used instead of .text: .text is None when an element has no
            # leading text (which made list(w.text) raise TypeError), and it stops at the
            # first child element, so any trop after a nested tag would be missed.
            pasuktrop = u''.join(c
                                 for w in words
                                 for text in w.itertext()
                                 for c in text
                                 if c in tropchars)
            # Append rather than assign, so a pasuk number that appears more than once
            # within a perek accumulates instead of being overwritten.
            perektrop[pasuknum] = perektrop.get(pasuknum, u'') + pasuktrop

## Build the tree (and the seqsearch helper)

In [497]:
def seqsearch(seq, s):
    """Count occurrences of the trop sequence `seq` and record them per pasuk.

    Parameters
    ----------
    seq : string of trop characters. Every character must be a key of `tropchars`
        (a KeyError is raised otherwise).
    s : list of (sefer, perek, pasuk) tuples limiting the search. An empty list means
        "search every pasuk in `tropstrings`".

    Returns
    -------
    (seqcount, sources) where `seqcount` is the total number of non-overlapping
    occurrences of `seq` in the pasukim examined and `sources` lists the pasukim with at
    least one occurrence. If `s` was non-empty and nothing matched, `s` itself is returned.

    Side effects
    ------------
    For every pasuk examined (including those with zero hits) the count is stored in the
    module-level `sequencecountsbypasuk[names][sefer][perek][pasuk]`, where `names` is the
    comma-joined list of trop names making up `seq`. The `s` argument is never modified.
    """
    seqlist = ','.join(tropchars[c] for c in seq)
    bypasuk = sequencecountsbypasuk.setdefault(seqlist, {})

    if s != []:
        candidates = s
    else:
        candidates = [(sefer, perek, pasuk)
                      for sefer in tropstrings
                      for perek in tropstrings[sefer]
                      for pasuk in tropstrings[sefer][perek]]

    seqcount = 0
    matched = []
    for source in candidates:
        sefer, perek, pasuk = source[0], source[1], source[2]
        # seq is always a literal run of trop characters (enforced by the tropchars lookup
        # above), so str.count gives the same non-overlapping count a regex search would,
        # without compiling a pattern for every pasuk.
        found = tropstrings[sefer][perek][pasuk].count(seq)
        seqcount += found
        if found > 0:
            matched.append(source)
        bypasuk.setdefault(sefer, {}).setdefault(perek, {})[pasuk] = found

    # A restricted search that finds nothing hands the restriction back unchanged.
    if s != [] and len(matched) == 0:
        return (seqcount, s)
    return (seqcount, matched)

In [429]:
# Spell out the trop of Dvarim 16:18 by name, in order.
[tropchars[t] for t in tropstrings['dvarim'][16][18]]

['munakh',
 'revii',
 'pashta',
 'katan',
 'kadma',
 'darga',
 'tevir',
 'merkha',
 'tipkha',
 'etnakhta',
 'merkha',
 'tipkha',
 'sofpasuk']

In [498]:
# seqsearch records its per-pasuk counts in the module-level sequencecountsbypasuk, which is
# only created further down, right before the tree is built. Create it here if it does not
# exist yet so this cell also runs on a fresh kernel; an existing store is left untouched.
if 'sequencecountsbypasuk' not in globals():
    sequencecountsbypasuk = {}

# Total merkha+tipkha occurrences across every pasuk, plus the pasukim that contain the pair.
seqsearch(tropnames['merkha'] + tropnames['tipkha'], [])

(5884,
 [('vayikra', 1, 1),
  ('vayikra', 1, 3),
  ('vayikra', 1, 4),
  ('vayikra', 1, 5),
  ('vayikra', 1, 6),
  ('vayikra', 1, 7),
  ('vayikra', 1, 9),
  ('vayikra', 1, 10),
  ('vayikra', 1, 13),
  ('vayikra', 1, 14),
  ('vayikra', 1, 16),
  ('vayikra', 1, 17),
  ('vayikra', 2, 1),
  ('vayikra', 2, 2),
  ('vayikra', 2, 3),
  ('vayikra', 2, 4),
  ('vayikra', 2, 5),
  ('vayikra', 2, 6),
  ('vayikra', 2, 7),
  ('vayikra', 2, 9),
  ('vayikra', 2, 10),
  ('vayikra', 2, 11),
  ('vayikra', 2, 12),
  ('vayikra', 2, 13),
  ('vayikra', 2, 14),
  ('vayikra', 2, 15),
  ('vayikra', 3, 1),
  ('vayikra', 3, 5),
  ('vayikra', 3, 6),
  ('vayikra', 3, 7),
  ('vayikra', 3, 9),
  ('vayikra', 3, 11),
  ('vayikra', 3, 12),
  ('vayikra', 3, 16),
  ('vayikra', 3, 17),
  ('vayikra', 4, 1),
  ('vayikra', 4, 4),
  ('vayikra', 4, 5),
  ('vayikra', 4, 8),
  ('vayikra', 4, 12),
  ('vayikra', 4, 13),
  ('vayikra', 4, 14),
  ('vayikra', 4, 15),
  ('vayikra', 4, 16),
  ('vayikra', 4, 19),
  ('vayikra', 4, 21),
  ('v

In [396]:
def getcounts(seq, sources):
    """Recursively build the tree of trop sequences that extend `seq`.

    Each known trop is appended to `seq` in turn and searched for with `seqsearch`,
    restricted to `sources` (an empty list means every pasuk). Every extension that occurs
    at least once becomes a node
    {'name': trop name, 'char': trop character, 'count': occurrences, 'children': [...]},
    whose children are computed the same way from the pasukim `seqsearch` returned.

    Returns the list of nodes, which is empty when no extension of `seq` occurs.
    """
    nodes = []
    for name, char in tropnames.items():
        extended = seq + char
        # seqsearch gets its own copy of the source list on every call
        total, matches = seqsearch(extended, copy.copy(sources))
        if total <= 0:
            continue
        node = {'name': tropchars[char], 'char': char, 'count': total}
        node['children'] = getcounts(extended, matches)
        nodes.append(node)
    return nodes

In [527]:
# Per-pasuk occurrence counts, filled in by seqsearch as a side effect while the tree is
# built: sequencecountsbypasuk[seq][sefer][perek][pasuk] -> count. Aggregated by perek later.
# Resetting it here means every build starts from an empty store.
sequencecountsbypasuk = {}

start = time.time()
tree = getcounts('', [])
# Wall-clock seconds for the full build. The call form of print behaves the same on
# Python 2 and 3 for a single argument.
print(time.time() - start)

309.38421011


In [395]:
# getcounts returns a list of {'name', 'char', 'count', 'children'} nodes rather than a dict
# keyed by trop name, so walk down the path by matching each node's 'name'.
node = {'children': tree}
for tropname in ['munakh', 'revii', 'pashta', 'katan']:
    node = next(child for child in node['children'] if child['name'] == tropname)
node['count']

92

In [333]:
# Total number of prakim parsed across all sfarim.
sum([len(sefer_prakim) for sefer_prakim in tropstrings.values()])

192

### Output the JSON

In [431]:
# json.dump writes text, so the file is opened in text mode: binary mode ('wb') raises
# TypeError on Python 3, while 'w' works on both Python 2 and 3.
with open('sequencetree-d3format.json', 'w') as outfile:
    json.dump(tree, outfile)

## Sources

In [451]:
import pandas as pd

In [466]:
# The five sfarim in canonical order; used below to order the 'sefer' index level.
sfarim = [
    'bereshit',
    'shmot',
    'vayikra',
    'bmidbar',
    'dvarim',
]

In [528]:
# Flatten sequencecountsbypasuk (seq -> sefer -> perek -> pasuk -> count) into one record
# per recorded pasuk, adding that pasuk's total number of trop marks.
flatcounts = []
for seq, seq_sfarim in sequencecountsbypasuk.items():
    for sefer, sefer_prakim in seq_sfarim.items():
        for perek, perek_psukim in sefer_prakim.items():
            for pasuk, found in perek_psukim.items():
                flatcounts.append({
                    'seq': seq,
                    'sefer': sefer,
                    'perek': perek,
                    'pasuk': pasuk,
                    'count': found,
                    'totaltrop': len(tropstrings[sefer][perek][pasuk]),
                })

# Index by (seq, sefer, perek, pasuk) and order the sefer level as listed in sfarim.
df = (
    pd.DataFrame(flatcounts)
    .set_index(['seq', 'sefer', 'perek', 'pasuk'])
    .reindex(sfarim, level=1)
)

In [529]:
# Sum the per-pasuk occurrence counts and trop totals within each (seq, sefer, perek).
# .sum() is the direct spelling of .aggregate(sum): pandas maps that callable onto the
# same group-wise sum, and newer releases deprecate relying on that mapping.
# NOTE(review): groupby sorts the group keys by default, so the sfarim ordering applied by
# the reindex above is presumably replaced by alphabetical order here - confirm which
# order the downstream JSON is meant to have.
byperek = df.groupby(level=['seq', 'sefer', 'perek']).sum()

In [530]:
# Occurrences of the sequence per trop mark, per (seq, sefer, perek). Both columns are
# sums over only the pasukim that were recorded for that sequence.
perek_counts = byperek['count']
perek_totals = byperek['totaltrop']
byperek['norm'] = perek_counts / perek_totals

In [577]:
# One record per (sefer, perek) for the munakh,revii sequence, labelled "sefer,perek".
bypereklist = [
    {'index': sefer + ',' + str(perek), 'count': values['count'], 'norm': values['norm']}
    for (sefer, perek), values in byperek.loc['munakh,revii'].iterrows()
]

In [578]:
# json.dump writes text, so the file is opened in text mode: binary mode ('wb') raises
# TypeError on Python 3, while 'w' works on both Python 2 and 3.
with open('byperek_munakhrevii.json', 'w') as outfile:
    json.dump(bypereklist, outfile)